@@ -51,10 +51,16 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const
5151 parsed = urlparse (url )
5252 is_api_request = "api.github.com" in parsed .netloc
5353 content_length = None
54- # just verify size if NOT is a request to api.github.com
54+ # Check file size before downloading the full body (skip for GitHub API requests,
55+ # which are always small JSON payloads).
5556 if not is_api_request :
5657 try :
57- head_response = requests .get (url , stream = True , allow_redirects = True , ** kwargs )
58+ # head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs)
59+ # Use a proper HEAD request to read only the response headers.
60+
61+ head_response = requests .head (url , allow_redirects = True ,
62+ timeout = constants .DOWNLOAD_TIMEOUT_SECONDS , ** kwargs )
63+ head_response .close () # release the connection back to the pool immediately
5864 content_length = head_response .headers .get ("Content-Length" )
5965 if content_length is not None :
6066 size_bytes = int (content_length )
@@ -789,59 +795,153 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe
789795 return None
790796
791797
798+ # def download_github_files(directory, owner, repo_name, repo_ref, authorization):
799+ # """
800+ # Download all repository files from a GitHub repository
801+ # Parameters
802+ # ----------
803+ # repo_ref: link to branch of the repo
804+ # repo_name: name of the repo
805+ # owner: GitHub owner
806+ # directory: directory where to extract all downloaded files
807+ # authorization: GitHub authorization token
808+
809+ # Returns
810+ # -------
811+ # path to the folder where all files have been downloaded
812+ # """
813+ # # download the repo at the selected branch with the link
814+ # repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip"
815+ # logging.info(f"Downloading {repo_archive_url}")
816+
817+ # repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
818+
819+ # if repo_download is None:
820+ # logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
821+ # return None
822+
823+ # if repo_download.status_code == 300:
824+ # logging.warning(f"Ambiguous ref detected for {repo_ref}, trying tags/heads resolution")
825+
826+ # for ref_type in ["tags", "heads"]:
827+ # repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/refs/{ref_type}/{repo_ref}.zip"
828+ # logging.info(f"Trying to download {repo_archive_url}")
829+
830+ # repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
831+
832+ # if repo_download is None:
833+ # logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content length.")
834+ # return None
835+
836+ # if repo_download.status_code == 200:
837+ # break
838+
839+ # if repo_download.status_code == 404:
840+ # logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
841+ # repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
842+ # logging.info(f"Trying to download {repo_archive_url}")
843+ # repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
844+ # if repo_download is None:
845+ # logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
846+ # return None
847+
848+ # if repo_download.status_code != 200:
849+ # logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
850+ # return None
851+
852+ # repo_zip = repo_download.content
853+
854+ # repo_name_full = owner + "_" + repo_name
855+ # repo_zip_file = os.path.join(directory, repo_name_full + ".zip")
856+ # repo_extract_dir = os.path.join(directory, repo_name_full)
857+
858+ # with open(repo_zip_file, "wb") as f:
859+ # f.write(repo_zip)
860+
861+ # try:
862+ # with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
863+ # zip_ref.extractall(repo_extract_dir)
864+ # except zipfile.BadZipFile:
865+ # logging.error("Downloaded archive is not a valid zip (repo may be empty)")
866+ # return None
867+
868+ # repo_folders = os.listdir(repo_extract_dir)
869+ # if not repo_folders:
870+ # logging.warning("Repository archive is empty")
871+ # return None
872+
873+ # repo_dir = os.path.join(repo_extract_dir, repo_folders[0])
874+ # return repo_dir
875+
792876def download_github_files (directory , owner , repo_name , repo_ref , authorization ):
793877 """
794- Download all repository files from a GitHub repository
878+ Download all repository files from a GitHub repository.
879+
880+ GitHub's short-form archive URL ``/archive/{ref}.zip`` returns HTTP 300 (Multiple
881+ Choices) when the ref name is **ambiguous** — i.e. a branch and a tag share the
882+ same name (e.g. a repo whose default branch is ``v2.0`` and also has a tag called
883+ ``v2.0``). In that case we must use the fully-qualified ref URLs:
884+ - ``/archive/refs/heads/{ref}.zip`` (explicit branch)
885+ - ``/archive/refs/tags/{ref}.zip`` (explicit tag)
886+
887+ We also keep the legacy ``main.zip`` fallback for repositories that renamed their
888+ default branch to ``main`` after being created with ``master`` (or vice-versa), in
889+ which case the GitHub API default_branch value may be momentarily stale.
890+
891+ Fallback order tried in sequence until one returns HTTP 200:
892+ 1. ``/archive/{ref}.zip`` — short form, works for unambiguous refs
893+ 2. ``/archive/refs/heads/{ref}.zip`` — unambiguous branch (fixes HTTP 300)
894+ 3. ``/archive/refs/tags/{ref}.zip`` — unambiguous tag (fixes HTTP 300)
895+ 4. ``/archive/main.zip`` — legacy branch-rename fallback
896+
795897 Parameters
796898 ----------
797- repo_ref: link to branch of the repo
899+ repo_ref: default branch (or tag) returned by the GitHub API
798900 repo_name: name of the repo
799901 owner: GitHub owner
800902 directory: directory where to extract all downloaded files
801903 authorization: GitHub authorization token
802904
803905 Returns
804906 -------
805- path to the folder where all files have been downloaded
907+ Path to the folder where all files have been downloaded, or None on failure.
806908 """
807- # download the repo at the selected branch with the link
808- repo_archive_url = f"https://github.com/{ owner } /{ repo_name } /archive/{ repo_ref } .zip"
809- logging .info (f"Downloading { repo_archive_url } " )
810-
811- repo_download , date = rate_limit_get (repo_archive_url , headers = header_template (authorization ))
812909
813- if repo_download is None :
814- logging .warning (f"Repository archive skipped due to size limit: { constants .SIZE_DOWNLOAD_LIMIT_MB } MB or not content lenght." )
815- return None
816-
817- if repo_download .status_code == 300 :
818- logging .warning (f"Ambiguous ref detected for { repo_ref } , trying tags/heads resolution" )
819-
820- for ref_type in ["tags" , "heads" ]:
821- repo_archive_url = f"https://github.com/{ owner } /{ repo_name } /archive/refs/{ ref_type } /{ repo_ref } .zip"
822- logging .info (f"Trying to download { repo_archive_url } " )
823-
824- repo_download , date = rate_limit_get (repo_archive_url , headers = header_template (authorization ))
825-
826- if repo_download is None :
827- logging .warning (f"Repository archive skipped due to size limit: { constants .SIZE_DOWNLOAD_LIMIT_MB } MB or not content length." )
828- return None
829-
830- if repo_download .status_code == 200 :
831- break
832-
833- if repo_download .status_code == 404 :
834- logging .error (f"Error: Archive request failed with HTTP { repo_download .status_code } " )
835- repo_archive_url = f"https://github.com/{ owner } /{ repo_name } /archive/main.zip"
836- logging .info (f"Trying to download { repo_archive_url } " )
910+ # Candidate archive URLs tried in order. We start with the short form because it
911+ # works for the vast majority of repos and avoids an extra HTTP round-trip. On any
912+ # non-200 status (e.g. 300 ambiguous ref, 404 ref not found) we escalate to the
913+ # fully-qualified refs/heads/ and refs/tags/ forms before falling back to main.
914+ candidate_urls = [
915+ f"https://github.com/{ owner } /{ repo_name } /archive/{ repo_ref } .zip" ,
916+ f"https://github.com/{ owner } /{ repo_name } /archive/refs/heads/{ repo_ref } .zip" ,
917+ f"https://github.com/{ owner } /{ repo_name } /archive/refs/tags/{ repo_ref } .zip" ,
918+ f"https://github.com/{ owner } /{ repo_name } /archive/main.zip" ,
919+ ]
920+ repo_download = None
921+ repo_archive_url = None
922+ for repo_archive_url in candidate_urls :
923+ logging .info (f"Downloading { repo_archive_url } " )
837924 repo_download , date = rate_limit_get (repo_archive_url , headers = header_template (authorization ))
838925 if repo_download is None :
839- logging .warning (f"Repository archive skipped due to size limit: { constants .SIZE_DOWNLOAD_LIMIT_MB } MB or not content lenght." )
926+ # Size limit exceeded or streaming error — no point trying other URLs
927+ logging .warning (
928+ f"Repository archive skipped due to size limit: "
929+ f"{ constants .SIZE_DOWNLOAD_LIMIT_MB } MB or no content-length."
930+ )
840931 return None
841932
842- if repo_download .status_code != 200 :
843- logging .error (f"Error: Archive request failed with HTTP { repo_download .status_code } " )
844- return None
933+ if repo_download .status_code == 200 :
934+ break
935+ logging .warning (
936+ f"Archive URL { repo_archive_url } returned HTTP { repo_download .status_code } , "
937+ f"trying next fallback..."
938+ )
939+ if repo_download is None or repo_download .status_code != 200 :
940+ logging .error (
941+ f"All archive download attempts failed for { owner } /{ repo_name } "
942+ f"(last status: { getattr (repo_download , 'status_code' , 'N/A' )} )"
943+ )
944+ return None
845945
846946 repo_zip = repo_download .content
847947
0 commit comments