Skip to content

Commit ca998c8

Browse files
authored
Merge pull request #938 from juanjemdIos/master
nltk to 3.9.4. Test with mockups by Priya. Contributors in README.md
2 parents 6abf8e8 + 0dc6125 commit ca998c8

4 files changed

Lines changed: 315 additions & 43 deletions

File tree

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,9 @@ Try SOMEF in Binder with our sample notebook: [![Binder](https://mybinder.org/ba
361361

362362
If you want to contribute with a pull request, please do so by submitting it to the `dev` branch.
363363

364+
## Contributors:
365+
Priyanka O.
366+
364367
## Next features:
365368

366369
To see upcoming features, please have a look at our [open issues](https://github.com/KnowledgeCaptureAndDiscovery/somef/issues) and [milestones](https://github.com/KnowledgeCaptureAndDiscovery/somef/milestones)

poetry.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/somef/process_repository.py

Lines changed: 138 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,16 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const
5151
parsed = urlparse(url)
5252
is_api_request = "api.github.com" in parsed.netloc
5353
content_length = None
54-
# just verify size if NOT is a request to api.github.com
54+
# Check file size before downloading the full body (skip for GitHub API requests,
55+
# which are always small JSON payloads).
5556
if not is_api_request:
5657
try:
57-
head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs)
58+
# head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs)
59+
# Use a proper HEAD request to read only the response headers.
60+
61+
head_response = requests.head(url, allow_redirects=True,
62+
timeout=constants.DOWNLOAD_TIMEOUT_SECONDS, **kwargs)
63+
head_response.close() # release the connection back to the pool immediately
5864
content_length = head_response.headers.get("Content-Length")
5965
if content_length is not None:
6066
size_bytes = int(content_length)
@@ -789,59 +795,153 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe
789795
return None
790796

791797

798+
# def download_github_files(directory, owner, repo_name, repo_ref, authorization):
799+
# """
800+
# Download all repository files from a GitHub repository
801+
# Parameters
802+
# ----------
803+
# repo_ref: link to branch of the repo
804+
# repo_name: name of the repo
805+
# owner: GitHub owner
806+
# directory: directory where to extract all downloaded files
807+
# authorization: GitHub authorization token
808+
809+
# Returns
810+
# -------
811+
# path to the folder where all files have been downloaded
812+
# """
813+
# # download the repo at the selected branch with the link
814+
# repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip"
815+
# logging.info(f"Downloading {repo_archive_url}")
816+
817+
# repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
818+
819+
# if repo_download is None:
820+
# logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
821+
# return None
822+
823+
# if repo_download.status_code == 300:
824+
# logging.warning(f"Ambiguous ref detected for {repo_ref}, trying tags/heads resolution")
825+
826+
# for ref_type in ["tags", "heads"]:
827+
# repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/refs/{ref_type}/{repo_ref}.zip"
828+
# logging.info(f"Trying to download {repo_archive_url}")
829+
830+
# repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
831+
832+
# if repo_download is None:
833+
# logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content length.")
834+
# return None
835+
836+
# if repo_download.status_code == 200:
837+
# break
838+
839+
# if repo_download.status_code == 404:
840+
# logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
841+
# repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
842+
# logging.info(f"Trying to download {repo_archive_url}")
843+
# repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
844+
# if repo_download is None:
845+
# logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
846+
# return None
847+
848+
# if repo_download.status_code != 200:
849+
# logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
850+
# return None
851+
852+
# repo_zip = repo_download.content
853+
854+
# repo_name_full = owner + "_" + repo_name
855+
# repo_zip_file = os.path.join(directory, repo_name_full + ".zip")
856+
# repo_extract_dir = os.path.join(directory, repo_name_full)
857+
858+
# with open(repo_zip_file, "wb") as f:
859+
# f.write(repo_zip)
860+
861+
# try:
862+
# with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
863+
# zip_ref.extractall(repo_extract_dir)
864+
# except zipfile.BadZipFile:
865+
# logging.error("Downloaded archive is not a valid zip (repo may be empty)")
866+
# return None
867+
868+
# repo_folders = os.listdir(repo_extract_dir)
869+
# if not repo_folders:
870+
# logging.warning("Repository archive is empty")
871+
# return None
872+
873+
# repo_dir = os.path.join(repo_extract_dir, repo_folders[0])
874+
# return repo_dir
875+
792876
def download_github_files(directory, owner, repo_name, repo_ref, authorization):
793877
"""
794-
Download all repository files from a GitHub repository
878+
Download all repository files from a GitHub repository.
879+
880+
GitHub's short-form archive URL ``/archive/{ref}.zip`` returns HTTP 300 (Multiple
881+
Choices) when the ref name is **ambiguous** — i.e. a branch and a tag share the
882+
same name (e.g. a repo whose default branch is ``v2.0`` and also has a tag called
883+
``v2.0``). In that case we must use the fully-qualified ref URLs:
884+
- ``/archive/refs/heads/{ref}.zip`` (explicit branch)
885+
- ``/archive/refs/tags/{ref}.zip`` (explicit tag)
886+
887+
We also keep the legacy ``main.zip`` fallback for repositories that renamed their
888+
default branch to ``main`` after being created with ``master`` (or vice-versa), in
889+
case the GitHub API ``default_branch`` value is momentarily stale.
890+
891+
Fallback order tried in sequence until one returns HTTP 200:
892+
1. ``/archive/{ref}.zip`` — short form, works for unambiguous refs
893+
2. ``/archive/refs/heads/{ref}.zip`` — unambiguous branch (fixes HTTP 300)
894+
3. ``/archive/refs/tags/{ref}.zip`` — unambiguous tag (fixes HTTP 300)
895+
4. ``/archive/main.zip`` — legacy branch-rename fallback
896+
795897
Parameters
796898
----------
797-
repo_ref: link to branch of the repo
899+
repo_ref: default branch (or tag) returned by the GitHub API
798900
repo_name: name of the repo
799901
owner: GitHub owner
800902
directory: directory where to extract all downloaded files
801903
authorization: GitHub authorization token
802904
803905
Returns
804906
-------
805-
path to the folder where all files have been downloaded
907+
Path to the folder where all files have been downloaded, or None on failure.
806908
"""
807-
# download the repo at the selected branch with the link
808-
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip"
809-
logging.info(f"Downloading {repo_archive_url}")
810-
811-
repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
812909

813-
if repo_download is None:
814-
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
815-
return None
816-
817-
if repo_download.status_code == 300:
818-
logging.warning(f"Ambiguous ref detected for {repo_ref}, trying tags/heads resolution")
819-
820-
for ref_type in ["tags", "heads"]:
821-
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/refs/{ref_type}/{repo_ref}.zip"
822-
logging.info(f"Trying to download {repo_archive_url}")
823-
824-
repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
825-
826-
if repo_download is None:
827-
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content length.")
828-
return None
829-
830-
if repo_download.status_code == 200:
831-
break
832-
833-
if repo_download.status_code == 404:
834-
logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
835-
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
836-
logging.info(f"Trying to download {repo_archive_url}")
910+
# Candidate archive URLs tried in order. We start with the short form because it
911+
# works for the vast majority of repos and avoids an extra HTTP round-trip. When
912+
# that returns any non-200 status (e.g. 300 ambiguous ref, 404 ref not found), we
913+
# escalate to the fully-qualified refs/heads/ and refs/tags/ forms, then to main.zip.
914+
candidate_urls = [
915+
f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip",
916+
f"https://github.com/{owner}/{repo_name}/archive/refs/heads/{repo_ref}.zip",
917+
f"https://github.com/{owner}/{repo_name}/archive/refs/tags/{repo_ref}.zip",
918+
f"https://github.com/{owner}/{repo_name}/archive/main.zip",
919+
]
920+
repo_download = None
921+
repo_archive_url = None
922+
for repo_archive_url in candidate_urls:
923+
logging.info(f"Downloading {repo_archive_url}")
837924
repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
838925
if repo_download is None:
839-
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
926+
# Size limit exceeded or streaming error — no point trying other URLs
927+
logging.warning(
928+
f"Repository archive skipped due to size limit: "
929+
f"{constants.SIZE_DOWNLOAD_LIMIT_MB} MB or no content-length."
930+
)
840931
return None
841932

842-
if repo_download.status_code != 200:
843-
logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
844-
return None
933+
if repo_download.status_code == 200:
934+
break
935+
logging.warning(
936+
f"Archive URL {repo_archive_url} returned HTTP {repo_download.status_code}, "
937+
f"trying next fallback..."
938+
)
939+
if repo_download is None or repo_download.status_code != 200:
940+
logging.error(
941+
f"All archive download attempts failed for {owner}/{repo_name} "
942+
f"(last status: {getattr(repo_download, 'status_code', 'N/A')})"
943+
)
944+
return None
845945

846946
repo_zip = repo_download.content
847947

0 commit comments

Comments
 (0)