From 32707d8a26609200246ad2689a09bc12b48ea1b7 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Fri, 13 Mar 2026 20:58:42 +0200 Subject: [PATCH 1/7] Add initial support for collecting fix commits and (PRs and issues) Signed-off-by: ziad hany --- .github/workflows/collect-fix-commits.yml | 37 ++++++ .github/workflows/collect-issues-prs.yml | 37 ++++++ .gitignore | 1 + README.md | 14 +- config/fix_commits_targets.json | 39 ++++++ config/issues_prs_targets.json | 31 +++++ fix_commits_collector.py | 147 +++++++++++++++++++++ issues_prs_collector.py | 151 ++++++++++++++++++++++ 8 files changed, 456 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/collect-fix-commits.yml create mode 100644 .github/workflows/collect-issues-prs.yml create mode 100644 .gitignore create mode 100644 config/fix_commits_targets.json create mode 100644 config/issues_prs_targets.json create mode 100644 fix_commits_collector.py create mode 100644 issues_prs_collector.py diff --git a/.github/workflows/collect-fix-commits.yml b/.github/workflows/collect-fix-commits.yml new file mode 100644 index 0000000..370350b --- /dev/null +++ b/.github/workflows/collect-fix-commits.yml @@ -0,0 +1,37 @@ +name: Hourly sync for collecting fix commits + +on: + workflow_dispatch: + schedule: + - cron: '0 * * * *' + +permissions: + contents: write + +jobs: + scheduled: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install required packages + run: pip install GitPython==3.1.46 packageurl-python==0.17.6 aboutcode.pipeline==0.2.1 + + - name: Run sync + run: python fix_commits_collector.py + + - name: Commit and push if it changed + run: |- + git config user.name "AboutCode Automation" + git config user.email "automation@aboutcode.org" + git add -A + timestamp=$(date -u) + git commit -m "$(echo -e "Sync Collecting Fix Commits: $timestamp\n\nSigned-off-by: 
AboutCode Automation ")" || exit 0 + git push \ No newline at end of file diff --git a/.github/workflows/collect-issues-prs.yml b/.github/workflows/collect-issues-prs.yml new file mode 100644 index 0000000..51d211b --- /dev/null +++ b/.github/workflows/collect-issues-prs.yml @@ -0,0 +1,37 @@ +name: Hourly sync for collecting issues and pull requests + +on: + workflow_dispatch: + schedule: + - cron: '0 * * * *' + +permissions: + contents: write + +jobs: + scheduled: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install required packages + run: pip install PyGithub==2.8.1 python-gitlab==8.1.0 aboutcode.pipeline==0.2.1 + + - name: Run sync + run: python collect_issues_prs.py + + - name: Commit and push if it changed + run: |- + git config user.name "AboutCode Automation" + git config user.email "automation@aboutcode.org" + git add -A + timestamp=$(date -u) + git commit -m "$(echo -e "Sync Collecting Fix Commits: $timestamp\n\nSigned-off-by: AboutCode Automation ")" || exit 0 + git push \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f10862a --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/.env diff --git a/README.md b/README.md index 01aa921..d46fd13 100644 --- a/README.md +++ b/README.md @@ -1 +1,13 @@ -# vulnerablecode-vcs-collector \ No newline at end of file +# vulnerablecode-vcs-collector + + + +## Usage + +To use the mirror, clone this repository: + +```bash +git clone https://github.com/aboutcode-data/vulnerablecode-vcs-collector.git +``` + +Once cloned, the catalog pages will be available in the `data/pages/` directory. 
diff --git a/config/fix_commits_targets.json b/config/fix_commits_targets.json new file mode 100644 index 0000000..ac612e8 --- /dev/null +++ b/config/fix_commits_targets.json @@ -0,0 +1,39 @@ +[ + "https://github.com/torvalds/linux", + "https://github.com/mirror/busybox", + "https://github.com/nginx/nginx", + "https://github.com/apache/tomcat", + "https://github.com/mysql/mysql-server", + "https://github.com/postgres/postgres", + "https://github.com/mongodb/mongo", + "https://github.com/redis/redis", + "https://github.com/sqlite/sqlite", + "https://github.com/php/php-src", + "https://github.com/python/cpython", + "https://github.com/ruby/ruby", + "https://github.com/golang/go", + "https://github.com/nodejs/node", + "https://github.com/rust-lang/rust", + "https://github.com/openjdk/jdk", + "https://github.com/swiftlang/swift", + "https://github.com/django/django", + "https://github.com/rails/rails", + "https://github.com/laravel/framework", + "https://github.com/spring-projects/spring-framework", + "https://github.com/facebook/react", + "https://github.com/angular/angular", + "https://github.com/WordPress/WordPress", + "https://github.com/moby/moby", + "https://github.com/kubernetes/kubernetes", + "https://gitlab.com/qemu-project/qemu", + "https://github.com/xen-project/xen", + "https://github.com/mirror/vbox", + "https://github.com/containerd/containerd", + "https://github.com/ansible/ansible", + "https://github.com/hashicorp/terraform", + "https://gitlab.com/wireshark/wireshark", + "https://github.com/the-tcpdump-group/tcpdump", + "https://github.com/git/git", + "https://github.com/jenkinsci/jenkins", + "https://gitlab.com/gitlab-org/gitlab-foss" +] \ No newline at end of file diff --git a/config/issues_prs_targets.json b/config/issues_prs_targets.json new file mode 100644 index 0000000..2bf491d --- /dev/null +++ b/config/issues_prs_targets.json @@ -0,0 +1,31 @@ +[ + "https://github.com/mirror/busybox", + "https://github.com/nginx/nginx", + 
"https://github.com/apache/tomcat", + "https://github.com/mongodb/mongo", + "https://github.com/redis/redis", + "https://github.com/php/php-src", + "https://github.com/python/cpython", + "https://github.com/ruby/ruby", + "https://github.com/golang/go", + "https://github.com/nodejs/node", + "https://github.com/rust-lang/rust", + "https://github.com/openjdk/jdk", + "https://github.com/swiftlang/swift", + "https://github.com/django/django", + "https://github.com/rails/rails", + "https://github.com/laravel/framework", + "https://github.com/spring-projects/spring-framework", + "https://github.com/facebook/react", + "https://github.com/angular/angular", + "https://github.com/moby/moby", + "https://github.com/kubernetes/kubernetes", + "https://github.com/containerd/containerd", + "https://github.com/ansible/ansible", + "https://github.com/hashicorp/terraform", + "https://github.com/the-tcpdump-group/tcpdump", + "https://github.com/jenkinsci/jenkins", + "https://gitlab.com/gitlab-org/gitlab-foss", + "https://gitlab.com/wireshark/wireshark", + "https://gitlab.com/qemu-project/qemu" +] \ No newline at end of file diff --git a/fix_commits_collector.py b/fix_commits_collector.py new file mode 100644 index 0000000..b43231b --- /dev/null +++ b/fix_commits_collector.py @@ -0,0 +1,147 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import hashlib +import json +import re +import shutil +import sys +import tempfile +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path + +from aboutcode.pipeline import BasePipeline, LoopProgress +from git import Repo +from packageurl.contrib.url2purl import url2purl + + +class CollectVCSFixCommitPipeline(BasePipeline): + """ + Pipeline to collect fix commits from any git repository. 
+ """ + + vcs_url: str + patterns: list[str] = [ + r"\bCVE-\d{4}-\d{4,19}\b", + r"GHSA-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}", + ] + + def __init__(self, vcs_url: str, *args, **kwargs): + self.vcs_url = vcs_url + super().__init__(*args, **kwargs) + + @classmethod + def steps(cls): + return ( + cls.clone, + cls.collect_fix_commits, + cls.store_items, + cls.clean_downloads, + ) + + def log(self, message): + now_local = datetime.now(timezone.utc).astimezone() + timestamp = now_local.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + message = f"{timestamp} {message}" + print(message) + + def clone(self): + """Clone the repository.""" + self.repo = Repo.clone_from( + url=self.vcs_url, + to_path=tempfile.mkdtemp(), + bare=True, + no_checkout=True, + multi_options=["--filter=blob:none"], + ) + + def extract_vulnerability_id(self, commit) -> list[str]: + """ + Extract vulnerability id from a commit message and returns a list of matched vulnerability IDs + """ + matches = [] + for pattern in self.patterns: + found = re.findall(pattern, commit.message, flags=re.IGNORECASE) + matches.extend(found) + return matches + + def collect_fix_commits(self): + """ + Iterate through repository commits and group them by vulnerability identifiers. + return a list with (vuln_id, [(commit_id, commit_message)]). + """ + self.log( + "Processing git repository fix commits (grouped by vulnerability IDs)." 
+ ) + + self.collected_items = { + "vcs_url": self.vcs_url, + "vulnerabilities": defaultdict(list), + } + + already_processed = set() + for commit in self.repo.iter_commits("--all"): + matched_ids = self.extract_vulnerability_id(commit) + if not matched_ids: + continue + + commit_id = commit.hexsha + commit_message = commit.message.strip() + + for vuln_id in matched_ids: + vuln_id = vuln_id.upper() + if (vuln_id, commit_id) not in already_processed: + self.collected_items["vulnerabilities"][vuln_id].append( + {commit_id: commit_message} + ) + already_processed.add((vuln_id, commit_id)) + + self.log( + f"Found {len(self.collected_items)} vulnerabilities with related commits." + ) + self.log("Finished processing all commits.") + return self.collected_items + + def store_items(self): + """Storing collected fix commits for this repository""" + self.log("Storing collected fix commits") + purl = url2purl(self.vcs_url) + + if not (purl and purl.name) or not self.collected_items.get("vulnerabilities"): + self.log("Nothing to store for collected fix commits") + return + + vcs_url_hash = hashlib.sha256(self.vcs_url.encode("utf-8")).hexdigest()[:8] + path = Path(f"data/fix-commits/{purl.name}-{vcs_url_hash}.json") + path.parent.mkdir(parents=True, exist_ok=True) + + with open(path, "w", encoding="utf-8") as f: + json.dump(self.collected_items, f, indent=2) + return + + def clean_downloads(self): + """Cleanup any temporary repository data""" + self.log("Cleaning up local repository resources") + if hasattr(self, "repo") and self.repo.working_dir: + shutil.rmtree(path=self.repo.working_dir) + + +if __name__ == "__main__": + with open("config/fix_commits_targets.json") as f: + vcs_urls = json.load(f) + + progress = LoopProgress( + total_iterations=len(vcs_urls), + logger=print, + ) + + for vcs_url in progress.iter(vcs_urls): + status_code, error_msg = CollectVCSFixCommitPipeline(vcs_url=vcs_url).execute() + print(error_msg) + + sys.exit(0) diff --git a/issues_prs_collector.py 
b/issues_prs_collector.py new file mode 100644 index 0000000..cda6978 --- /dev/null +++ b/issues_prs_collector.py @@ -0,0 +1,151 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json +import os +import re +import sys +from abc import abstractmethod +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path +from urllib.parse import urlparse + +import gitlab +from aboutcode.pipeline import BasePipeline, LoopProgress +from github import Github + +github_token = os.environ.get("GITHUB_TOKEN") +gitlab_token = os.environ.get("GITLAB_TOKEN") + + +class VCSCollector(BasePipeline): + """ + Pipeline to collect GitHub/GitLab issues and PRs related to vulnerabilities. + """ + + vcs_url: str + CVE_PATTERN = re.compile(r"(CVE-\d{4}-\d+)", re.IGNORECASE) + SUPPORTED_IDENTIFIERS = ["CVE-"] + + collected_items: dict = {} + + def __init__(self, vcs_url: str, *args, **kwargs): + self.vcs_url = vcs_url + super().__init__(*args, **kwargs) + + @classmethod + def steps(cls): + return ( + cls.configure_target, + cls.fetch_entries, + cls.collect_items, + cls.store_items, + ) + + def configure_target(self): + parsed_url = urlparse(self.vcs_url) + parts = parsed_url.path.strip("/").split("/") + if len(parts) < 2: + raise ValueError(f"Invalid URL: {self.vcs_url}") + + self.repo_name = f"{parts[0]}/{parts[1]}" + + def log(self, message): + now_local = datetime.now(timezone.utc).astimezone() + timestamp = now_local.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + message = f"{timestamp} {message}" + print(message) + + @abstractmethod + def fetch_entries(self): + raise NotImplementedError + + @abstractmethod + def collect_items(self): + raise NotImplementedError + + def store_items(self): + self.log("Storing collected fix commit results.") + 
repo_name = self.vcs_url.replace("https://github.com", "") + path = Path(f"data/issues-prs/{repo_name}.json") + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + serialized_data = { + cve: {i_type: list(set(urls)) for i_type, urls in type_data.items()} + for cve, type_data in self.collected_items.items() + } + + json.dump(serialized_data, f, indent=2) + return + + +class GitLabCollector(VCSCollector): + def fetch_entries(self): + """Fetch GitLab Data Entries""" + gl = gitlab.Gitlab("https://gitlab.com/", private_token=gitlab_token) + project = gl.projects.get(self.repo_name) + base_query = " ".join(self.SUPPORTED_IDENTIFIERS) + self.issues = project.search(scope="issues", search=base_query) + self.prs = project.search(scope="merge_requests", search=base_query) + + def collect_items(self): + self.collected_items = defaultdict(lambda: defaultdict(set)) + for i_type, items in [("Issue", self.issues), ("PR", self.prs)]: + for item in items: + title = item.get("title") or "" + description = item.get("description") or "" + matches = self.CVE_PATTERN.findall(title + " " + description) + for match in matches: + cve_id = match.upper() + url = item.get("web_url") + if not url: + continue + self.collected_items[cve_id][i_type].add(url) + + +class GitHubCollector(VCSCollector): + def fetch_entries(self): + """Fetch GitHub Data Entries""" + g = Github(login_or_token=github_token) + base_query = ( + f"repo:{self.repo_name} ({' OR '.join(self.SUPPORTED_IDENTIFIERS)})" + ) + self.issues = g.search_issues(f"{base_query} is:issue") + self.prs = g.search_issues(f"{base_query} is:pr") + + def collect_items(self): + self.collected_items = defaultdict(lambda: defaultdict(set)) + for i_type, items in [("Issues", self.issues), ("PRs", self.prs)]: + for item in items: + matches = self.CVE_PATTERN.findall(item.title + " " + (item.body or "")) + for match in matches: + cve_id = match.upper() + 
self.collected_items[cve_id][i_type].add(item.html_url) + + +if __name__ == "__main__": + with open("config/issues_prs_targets.json") as f: + vcs_urls = json.load(f) + + progress = LoopProgress( + total_iterations=len(vcs_urls), + logger=print, + ) + for vcs_url in progress.iter(vcs_urls): + if vcs_url.startswith("https://gitlab.com"): + collector = GitLabCollector(vcs_url=vcs_url) + elif vcs_url.startswith("https://github.com"): + collector = GitHubCollector(vcs_url=vcs_url) + else: + print(f"Unsupported VCS URL: {vcs_url}") + continue + + status_code, error_msg = collector.execute() + print(error_msg) + + sys.exit(0) From d341543a4de79ff4682bd6f07130e7b81105d43f Mon Sep 17 00:00:00 2001 From: ziad hany Date: Sat, 14 Mar 2026 06:18:47 +0200 Subject: [PATCH 2/7] Update issues_prs_collector to use packageurl and fix a typo Signed-off-by: ziad hany --- .github/workflows/collect-issues-prs.yml | 2 +- issues_prs_collector.py | 71 +++++++++++++----------- 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/.github/workflows/collect-issues-prs.yml b/.github/workflows/collect-issues-prs.yml index 51d211b..2f60b5c 100644 --- a/.github/workflows/collect-issues-prs.yml +++ b/.github/workflows/collect-issues-prs.yml @@ -33,5 +33,5 @@ jobs: git config user.email "automation@aboutcode.org" git add -A timestamp=$(date -u) - git commit -m "$(echo -e "Sync Collecting Fix Commits: $timestamp\n\nSigned-off-by: AboutCode Automation ")" || exit 0 + git commit -m "$(echo -e "Sync Collecting Issues and Pull requests related to vulnerabilities.: $timestamp\n\nSigned-off-by: AboutCode Automation ")" || exit 0 git push \ No newline at end of file diff --git a/issues_prs_collector.py b/issues_prs_collector.py index cda6978..5985e27 100644 --- a/issues_prs_collector.py +++ b/issues_prs_collector.py @@ -5,6 +5,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# +import hashlib import json import os import re @@ -13,11 +14,11 @@ from collections import defaultdict from datetime import datetime, timezone from pathlib import Path -from urllib.parse import urlparse import gitlab from aboutcode.pipeline import BasePipeline, LoopProgress from github import Github +from packageurl.contrib.url2purl import url2purl github_token = os.environ.get("GITHUB_TOKEN") gitlab_token = os.environ.get("GITLAB_TOKEN") @@ -32,29 +33,24 @@ class VCSCollector(BasePipeline): CVE_PATTERN = re.compile(r"(CVE-\d{4}-\d+)", re.IGNORECASE) SUPPORTED_IDENTIFIERS = ["CVE-"] - collected_items: dict = {} - - def __init__(self, vcs_url: str, *args, **kwargs): + def __init__(self, vcs_url: str, purl, *args, **kwargs): self.vcs_url = vcs_url + self.purl = purl + self.repo_name = f"{self.purl.namespace}/{self.purl.name}" + self.collected_items = { + "vcs_url": self.vcs_url, + "vulnerabilities": defaultdict(lambda: {"Issues": [], "PRs": []}), + } super().__init__(*args, **kwargs) @classmethod def steps(cls): return ( - cls.configure_target, cls.fetch_entries, cls.collect_items, cls.store_items, ) - def configure_target(self): - parsed_url = urlparse(self.vcs_url) - parts = parsed_url.path.strip("/").split("/") - if len(parts) < 2: - raise ValueError(f"Invalid URL: {self.vcs_url}") - - self.repo_name = f"{parts[0]}/{parts[1]}" - def log(self, message): now_local = datetime.now(timezone.utc).astimezone() timestamp = now_local.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] @@ -70,18 +66,17 @@ def collect_items(self): raise NotImplementedError def store_items(self): - self.log("Storing collected fix commit results.") - repo_name = self.vcs_url.replace("https://github.com", "") - path = Path(f"data/issues-prs/{repo_name}.json") + self.log("Storing collected fix commit results") + if not self.collected_items: + self.log("No collected fix commit results") + return + + vcs_url_hash = hashlib.sha256(self.vcs_url.encode("utf-8")).hexdigest()[:8] + path = 
Path(f"data/issues-prs/{self.purl.name}-{vcs_url_hash}.json") + path.parent.mkdir(parents=True, exist_ok=True) with open(path, "w", encoding="utf-8") as f: - serialized_data = { - cve: {i_type: list(set(urls)) for i_type, urls in type_data.items()} - for cve, type_data in self.collected_items.items() - } - - json.dump(serialized_data, f, indent=2) - return + json.dump(self.collected_items, f, indent=2) class GitLabCollector(VCSCollector): @@ -94,18 +89,20 @@ def fetch_entries(self): self.prs = project.search(scope="merge_requests", search=base_query) def collect_items(self): - self.collected_items = defaultdict(lambda: defaultdict(set)) - for i_type, items in [("Issue", self.issues), ("PR", self.prs)]: + for i_type, items in [("Issues", self.issues), ("PRs", self.prs)]: for item in items: title = item.get("title") or "" description = item.get("description") or "" matches = self.CVE_PATTERN.findall(title + " " + description) + seen_urls = set() for match in matches: cve_id = match.upper() url = item.get("web_url") - if not url: + if not url or cve_id in seen_urls: continue - self.collected_items[cve_id][i_type].add(url) + + self.collected_items["vulnerabilities"][cve_id][i_type].append(url) + seen_urls.add(cve_id) class GitHubCollector(VCSCollector): @@ -119,13 +116,18 @@ def fetch_entries(self): self.prs = g.search_issues(f"{base_query} is:pr") def collect_items(self): - self.collected_items = defaultdict(lambda: defaultdict(set)) for i_type, items in [("Issues", self.issues), ("PRs", self.prs)]: for item in items: matches = self.CVE_PATTERN.findall(item.title + " " + (item.body or "")) + seen_urls = set() for match in matches: cve_id = match.upper() - self.collected_items[cve_id][i_type].add(item.html_url) + if not item.html_url or item.html_url in seen_urls: + continue + self.collected_items["vulnerabilities"][cve_id][i_type].append( + item.html_url + ) + seen_urls.add(item.html_url) if __name__ == "__main__": @@ -137,10 +139,13 @@ def collect_items(self): 
logger=print, ) for vcs_url in progress.iter(vcs_urls): - if vcs_url.startswith("https://gitlab.com"): - collector = GitLabCollector(vcs_url=vcs_url) - elif vcs_url.startswith("https://github.com"): - collector = GitHubCollector(vcs_url=vcs_url) + purl = url2purl(vcs_url) + purl_type = purl.type + + if purl_type == "gitlab": + collector = GitLabCollector(vcs_url=vcs_url, purl=purl) + elif purl_type == "github": + collector = GitHubCollector(vcs_url=vcs_url, purl=purl) else: print(f"Unsupported VCS URL: {vcs_url}") continue From 7df4ae434e4cb1902bdd7b6133ca5dafc0c7b0b6 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Mon, 16 Mar 2026 16:23:30 +0200 Subject: [PATCH 3/7] Update docs, Simplify fix commits and Fix CI Signed-off-by: ziad hany --- .github/workflows/collect-issues-prs.yml | 4 +- README.md | 51 +++++++++++++++++++++++- fix_commits_collector.py | 12 ++---- issues_prs_collector.py | 16 ++++---- 4 files changed, 64 insertions(+), 19 deletions(-) diff --git a/.github/workflows/collect-issues-prs.yml b/.github/workflows/collect-issues-prs.yml index 2f60b5c..bb4cd70 100644 --- a/.github/workflows/collect-issues-prs.yml +++ b/.github/workflows/collect-issues-prs.yml @@ -22,7 +22,7 @@ jobs: python-version: '3.10' - name: Install required packages - run: pip install PyGithub==2.8.1 python-gitlab==8.1.0 aboutcode.pipeline==0.2.1 + run: pip install PyGithub==2.8.1 packageurl-python==0.17.6 python-gitlab==8.1.0 aboutcode.pipeline==0.2.1 - name: Run sync run: python collect_issues_prs.py @@ -33,5 +33,5 @@ jobs: git config user.email "automation@aboutcode.org" git add -A timestamp=$(date -u) - git commit -m "$(echo -e "Sync Collecting Issues and Pull requests related to vulnerabilities.: $timestamp\n\nSigned-off-by: AboutCode Automation ")" || exit 0 + git commit -m "$(echo -e "Sync Collecting Issues and Pull requests related to vulnerabilities: $timestamp\n\nSigned-off-by: AboutCode Automation ")" || exit 0 git push \ No newline at end of file diff --git a/README.md 
b/README.md index d46fd13..f9a529a 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,60 @@ # vulnerablecode-vcs-collector +Collect data ( fix commits , issues, prs ) related to vulnerabilities +#### Fix commits: +To collect fix commits we clone the target git repo and loop over every git commit message searching for ( CVE-id or GHSA-id ) +File structure: + +```json +{ + "vcs_url": "https://github.com/mirror/busybox", + "vulnerabilities": { + "CVE-2023-42363": { + "fb08d43d44d1fea1f741fafb9aa7e1958a5f69aa": "awk: fix use after free (CVE-2023-42363)\n\nfunction old new delta\nevaluate 3377 3385 +8\n\nFixes https://bugs.busybox.net/show_bug.cgi?id=15865\n\nSigned-off-by: Natanael Copa \nSigned-off-by: Denys Vlasenko " + } + } +} +``` + +#### Issues and PRs: +To collect issues and pull requests we are using Github/Gitlab API to do quick search by `CVE-` + +File structure: + +```json +{ + "vcs_url": "https://github.com/python/cpython", + "vulnerabilities": { + "CVE-2026-2297": { + "Issues": [ + "https://github.com/python/cpython/issues/145506" + ], + "PRs": [ + "https://github.com/python/cpython/pull/145514", + "https://github.com/python/cpython/pull/145516", + "https://github.com/python/cpython/pull/145515", + "https://github.com/python/cpython/pull/145507", + "https://github.com/python/cpython/pull/145512", + "https://github.com/python/cpython/pull/145513" + ] + } + } +} +``` + +### File Naming +The results are stored in a json file `{repo_name}-{repo_url_hash}.json` ex: `nginx-9251c307.json` + +**Notes:** `repo_url_hash` represents the first 8 characters of repository url `SHA-256` hash ## Usage -To use the mirror, clone this repository: +To get started, clone the repository: ```bash git clone https://github.com/aboutcode-data/vulnerablecode-vcs-collector.git ``` -Once cloned, the catalog pages will be available in the `data/pages/` directory. 
+ +Once cloned, you can find the existing data in the `data/fix-commits` or `data/issues-prs` directory \ No newline at end of file diff --git a/fix_commits_collector.py b/fix_commits_collector.py index b43231b..4b32769 100644 --- a/fix_commits_collector.py +++ b/fix_commits_collector.py @@ -73,7 +73,6 @@ def extract_vulnerability_id(self, commit) -> list[str]: def collect_fix_commits(self): """ Iterate through repository commits and group them by vulnerability identifiers. - return a list with (vuln_id, [(commit_id, commit_message)]). """ self.log( "Processing git repository fix commits (grouped by vulnerability IDs)." @@ -81,10 +80,9 @@ def collect_fix_commits(self): self.collected_items = { "vcs_url": self.vcs_url, - "vulnerabilities": defaultdict(list), + "vulnerabilities": defaultdict(dict), } - already_processed = set() for commit in self.repo.iter_commits("--all"): matched_ids = self.extract_vulnerability_id(commit) if not matched_ids: @@ -95,11 +93,9 @@ def collect_fix_commits(self): for vuln_id in matched_ids: vuln_id = vuln_id.upper() - if (vuln_id, commit_id) not in already_processed: - self.collected_items["vulnerabilities"][vuln_id].append( - {commit_id: commit_message} - ) - already_processed.add((vuln_id, commit_id)) + self.collected_items["vulnerabilities"][vuln_id][ + commit_id + ] = commit_message self.log( f"Found {len(self.collected_items)} vulnerabilities with related commits." 
diff --git a/issues_prs_collector.py b/issues_prs_collector.py index 5985e27..b1f21f2 100644 --- a/issues_prs_collector.py +++ b/issues_prs_collector.py @@ -66,9 +66,9 @@ def collect_items(self): raise NotImplementedError def store_items(self): - self.log("Storing collected fix commit results") - if not self.collected_items: - self.log("No collected fix commit results") + self.log("Storing collected Issues and PRs commit results") + if not self.collected_items.get("vulnerabilities"): + self.log("No collected Issues and PRs results") return vcs_url_hash = hashlib.sha256(self.vcs_url.encode("utf-8")).hexdigest()[:8] @@ -85,8 +85,10 @@ def fetch_entries(self): gl = gitlab.Gitlab("https://gitlab.com/", private_token=gitlab_token) project = gl.projects.get(self.repo_name) base_query = " ".join(self.SUPPORTED_IDENTIFIERS) - self.issues = project.search(scope="issues", search=base_query) - self.prs = project.search(scope="merge_requests", search=base_query) + self.issues = project.search(scope="issues", search=base_query, iterator=True) + self.prs = project.search( + scope="merge_requests", search=base_query, iterator=True + ) def collect_items(self): for i_type, items in [("Issues", self.issues), ("PRs", self.prs)]: @@ -98,11 +100,11 @@ def collect_items(self): for match in matches: cve_id = match.upper() url = item.get("web_url") - if not url or cve_id in seen_urls: + if not url or url in seen_urls: continue self.collected_items["vulnerabilities"][cve_id][i_type].append(url) - seen_urls.add(cve_id) + seen_urls.add(url) class GitHubCollector(VCSCollector): From 093f1594f84f5cc9bba02d32e4c042499b3275e9 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Mon, 16 Mar 2026 19:52:09 +0200 Subject: [PATCH 4/7] Rename the variable name for env secrets pipeline Signed-off-by: ziad hany --- .github/workflows/collect-issues-prs.yml | 5 ++++- issues_prs_collector.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/collect-issues-prs.yml 
b/.github/workflows/collect-issues-prs.yml index bb4cd70..8c4b49a 100644 --- a/.github/workflows/collect-issues-prs.yml +++ b/.github/workflows/collect-issues-prs.yml @@ -11,6 +11,9 @@ permissions: jobs: scheduled: runs-on: ubuntu-latest + env: + GITHUB_TOKEN: ${{ secrets.GH_API_TOKEN }} + GITLAB_TOKEN: ${{ secrets.GLAB_API_TOKEN }} steps: - name: Checkout repository @@ -25,7 +28,7 @@ jobs: run: pip install PyGithub==2.8.1 packageurl-python==0.17.6 python-gitlab==8.1.0 aboutcode.pipeline==0.2.1 - name: Run sync - run: python collect_issues_prs.py + run: python issues_prs_collector.py - name: Commit and push if it changed run: |- diff --git a/issues_prs_collector.py b/issues_prs_collector.py index b1f21f2..14415ab 100644 --- a/issues_prs_collector.py +++ b/issues_prs_collector.py @@ -20,8 +20,8 @@ from github import Github from packageurl.contrib.url2purl import url2purl -github_token = os.environ.get("GITHUB_TOKEN") -gitlab_token = os.environ.get("GITLAB_TOKEN") +github_token = os.environ.get("GH_API_TOKEN") +gitlab_token = os.environ.get("GLAB_API_TOKEN") class VCSCollector(BasePipeline): From 327395f4eb3dd1aef6ca1f699907576d09fbcd7e Mon Sep 17 00:00:00 2001 From: ziad hany Date: Mon, 16 Mar 2026 21:28:29 +0200 Subject: [PATCH 5/7] Update the pipeline to use python-dotenv Signed-off-by: ziad hany --- .github/workflows/collect-issues-prs.yml | 6 +++--- issues_prs_collector.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/collect-issues-prs.yml b/.github/workflows/collect-issues-prs.yml index 8c4b49a..8abac27 100644 --- a/.github/workflows/collect-issues-prs.yml +++ b/.github/workflows/collect-issues-prs.yml @@ -12,8 +12,8 @@ jobs: scheduled: runs-on: ubuntu-latest env: - GITHUB_TOKEN: ${{ secrets.GH_API_TOKEN }} - GITLAB_TOKEN: ${{ secrets.GLAB_API_TOKEN }} + GH_API_TOKEN: ${{ secrets.GH_API_TOKEN }} + GLAB_API_TOKEN: ${{ secrets.GLAB_API_TOKEN }} steps: - name: Checkout repository @@ -25,7 +25,7 @@ jobs: python-version: 
'3.10' - name: Install required packages - run: pip install PyGithub==2.8.1 packageurl-python==0.17.6 python-gitlab==8.1.0 aboutcode.pipeline==0.2.1 + run: pip install PyGithub==2.8.1 python-dotenv==1.2.2 packageurl-python==0.17.6 python-gitlab==8.1.0 aboutcode.pipeline==0.2.1 - name: Run sync run: python issues_prs_collector.py diff --git a/issues_prs_collector.py b/issues_prs_collector.py index 14415ab..66534b2 100644 --- a/issues_prs_collector.py +++ b/issues_prs_collector.py @@ -19,10 +19,12 @@ from aboutcode.pipeline import BasePipeline, LoopProgress from github import Github from packageurl.contrib.url2purl import url2purl +from dotenv import load_dotenv -github_token = os.environ.get("GH_API_TOKEN") -gitlab_token = os.environ.get("GLAB_API_TOKEN") +load_dotenv() +github_token = os.getenv("GH_API_TOKEN") +gitlab_token = os.getenv("GLAB_API_TOKEN") class VCSCollector(BasePipeline): """ From a5511f28330476ab7a1ec452a13fd4b980b53734 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Tue, 17 Mar 2026 01:24:47 +0200 Subject: [PATCH 6/7] Update the docs Make sure the pipline throw error if the no token inserted Update the pipeline to use repo secrets avoid env secrets for github actions Signed-off-by: ziad hany --- .github/workflows/collect-issues-prs.yml | 6 +++--- README.md | 21 ++++++++++++++++++++- issues_prs_collector.py | 15 +++++++++++---- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/.github/workflows/collect-issues-prs.yml b/.github/workflows/collect-issues-prs.yml index 8abac27..2a3df7b 100644 --- a/.github/workflows/collect-issues-prs.yml +++ b/.github/workflows/collect-issues-prs.yml @@ -11,9 +11,6 @@ permissions: jobs: scheduled: runs-on: ubuntu-latest - env: - GH_API_TOKEN: ${{ secrets.GH_API_TOKEN }} - GLAB_API_TOKEN: ${{ secrets.GLAB_API_TOKEN }} steps: - name: Checkout repository @@ -28,6 +25,9 @@ jobs: run: pip install PyGithub==2.8.1 python-dotenv==1.2.2 packageurl-python==0.17.6 python-gitlab==8.1.0 aboutcode.pipeline==0.2.1 - name: 
Run sync + env: + GH_API_TOKEN: ${{ secrets.GH_API_TOKEN }} + GLAB_API_TOKEN: ${{ secrets.GLAB_API_TOKEN }} run: python issues_prs_collector.py - name: Commit and push if it changed diff --git a/README.md b/README.md index f9a529a..54da052 100644 --- a/README.md +++ b/README.md @@ -57,4 +57,23 @@ git clone https://github.com/aboutcode-data/vulnerablecode-vcs-collector.git ``` -Once cloned, you can find the existing data in the `data/fix-commits` or `data/issues-prs` directory \ No newline at end of file +Once cloned, you can find the existing data in the `data/fix-commits` or `data/issues-prs` directory + +To run the pipeline and generate new files, create the `.env` file and add your API tokens: + +```bash +GH_API_TOKEN="ghp_xxx" +GLAB_API_TOKEN="glpat-xxx" +``` + +Then, you can run the collectors using Python: + +To collect fix commits: +```bash +python fix_commits_collector.py +``` + +To collect issues and pull requests: +```bash +python issues_prs_collector.py +``` \ No newline at end of file diff --git a/issues_prs_collector.py b/issues_prs_collector.py index 66534b2..2242192 100644 --- a/issues_prs_collector.py +++ b/issues_prs_collector.py @@ -17,14 +17,12 @@ import gitlab from aboutcode.pipeline import BasePipeline, LoopProgress +from dotenv import load_dotenv from github import Github from packageurl.contrib.url2purl import url2purl -from dotenv import load_dotenv load_dotenv() -github_token = os.getenv("GH_API_TOKEN") -gitlab_token = os.getenv("GLAB_API_TOKEN") class VCSCollector(BasePipeline): """ @@ -83,7 +81,12 @@ def store_items(self): class GitLabCollector(VCSCollector): def fetch_entries(self): - """Fetch GitLab Data Entries""" + """Fetch Gitlab Data Entries""" + gitlab_token = os.getenv("GLAB_API_TOKEN") + + if not gitlab_token: + raise ValueError("GLAB_API_TOKEN environment variable not set properly") + gl = gitlab.Gitlab("https://gitlab.com/", private_token=gitlab_token) project = gl.projects.get(self.repo_name) base_query = " 
".join(self.SUPPORTED_IDENTIFIERS) @@ -112,6 +115,10 @@ def collect_items(self): class GitHubCollector(VCSCollector): def fetch_entries(self): """Fetch GitHub Data Entries""" + github_token = os.getenv("GH_API_TOKEN") + if not github_token: + raise ValueError("GH_API_TOKEN environment variable not set properly") + g = Github(login_or_token=github_token) base_query = ( f"repo:{self.repo_name} ({' OR '.join(self.SUPPORTED_IDENTIFIERS)})" From a0e821fd556c222a64d787092eb511b76f92a126 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Fri, 10 Apr 2026 20:27:03 +0200 Subject: [PATCH 7/7] Add a test for collect fix commits and issue prs Add more target repo for fix commits collection Signed-off-by: ziad hany --- README.md | 15 +- config/fix_commits_targets.json | 400 +++++++++++++++++++++++++++++--- test/test_fix_commits.py | 71 ++++++ test/test_issues_prs.py | 105 +++++++++ 4 files changed, 553 insertions(+), 38 deletions(-) create mode 100644 test/test_fix_commits.py create mode 100644 test/test_issues_prs.py diff --git a/README.md b/README.md index 54da052..794bd90 100644 --- a/README.md +++ b/README.md @@ -76,4 +76,17 @@ python fix_commits_collector.py To collect issues and pull requests: ```bash python issues_prs_collector.py -``` \ No newline at end of file +``` + +## Testing + +Ensure you have `pytest` installed by running this command: +```bash +pip install pytest +``` + +Then, you can run the tests using this command: +```bash +python -m pytest test/ -v +``` + diff --git a/config/fix_commits_targets.json b/config/fix_commits_targets.json index ac612e8..668f6b0 100644 --- a/config/fix_commits_targets.json +++ b/config/fix_commits_targets.json @@ -1,39 +1,365 @@ [ - "https://github.com/torvalds/linux", - "https://github.com/mirror/busybox", - "https://github.com/nginx/nginx", - "https://github.com/apache/tomcat", - "https://github.com/mysql/mysql-server", - "https://github.com/postgres/postgres", - "https://github.com/mongodb/mongo", - 
"https://github.com/redis/redis", - "https://github.com/sqlite/sqlite", - "https://github.com/php/php-src", - "https://github.com/python/cpython", - "https://github.com/ruby/ruby", - "https://github.com/golang/go", - "https://github.com/nodejs/node", - "https://github.com/rust-lang/rust", - "https://github.com/openjdk/jdk", - "https://github.com/swiftlang/swift", - "https://github.com/django/django", - "https://github.com/rails/rails", - "https://github.com/laravel/framework", - "https://github.com/spring-projects/spring-framework", - "https://github.com/facebook/react", - "https://github.com/angular/angular", - "https://github.com/WordPress/WordPress", - "https://github.com/moby/moby", - "https://github.com/kubernetes/kubernetes", - "https://gitlab.com/qemu-project/qemu", - "https://github.com/xen-project/xen", - "https://github.com/mirror/vbox", - "https://github.com/containerd/containerd", - "https://github.com/ansible/ansible", - "https://github.com/hashicorp/terraform", - "https://gitlab.com/wireshark/wireshark", - "https://github.com/the-tcpdump-group/tcpdump", - "https://github.com/git/git", - "https://github.com/jenkinsci/jenkins", - "https://gitlab.com/gitlab-org/gitlab-foss" + "https://github.com/apache/poi", + "https://github.com/wagtail/wagtail", + "https://github.com/github/docs", + "https://github.com/kubernetes-sigs/secrets-store-csi-driver", + "https://github.com/rancher/rancher", + "https://github.com/apache/ranger", + "https://github.com/opentofu/opentofu", + "https://github.com/apache/santuario-xml-security-java", + "https://github.com/nationalsecurityagency/ghidra", + "https://github.com/squid-cache/squid", + "https://gitlab.com/gitlab-org/gitlab", + "https://github.com/jackc/pgx", + "https://github.com/apache/ignite", + "https://github.com/webpack/webpack", + "https://github.com/python/cpython", + "https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git", + "https://github.com/apache/hive", + "https://github.com/onnx/onnx", + 
"https://github.com/apache/jackrabbit", + "https://github.com/postgres/postgres", + "https://github.com/rust-lang/rust", + "https://gitlab.com/samba-team/samba", + "https://github.com/kubernetes-sigs/aws-iam-authenticator", + "https://github.com/apache/logging-log4net", + "https://github.com/psf/requests", + "https://github.com/swiftlang/swift", + "https://github.com/openstack/keystone", + "https://github.com/apache/hadoop", + "https://github.com/apache/mina-sshd", + "https://github.com/nixos/nixpkgs", + "https://github.com/imagemagick/imagemagick6", + "https://android.googlesource.com/platform/external/expat", + "https://bitbucket.org/b_c/jose4j", + "https://bitbucket.org/tildeslash/monit", + "https://git.kernel.org/pub/scm/utils/util-linux/util-linux.git", + "https://github.com/ansible-collections/community.general", + "https://github.com/beego/beego", + "https://github.com/apache/cordova-android", + "https://github.com/xen-project/xen", + "https://github.com/openssh/openssh-portable", + "https://github.com/denoland/std", + "https://github.com/angular/angular-cli", + "https://github.com/sveltejs/kit", + "https://github.com/bytecodealliance/wasmtime", + "https://github.com/dart-lang/http", + "https://gitlab.com/libtiff/libtiff", + "https://github.com/apache/cxf", + "https://github.com/swagger-api/swagger-ui", + "https://github.com/golang/go", + "https://gitlab.com/cryptsetup/cryptsetup", + "https://github.com/matrix-org/synapse", + "https://github.com/apache/pulsar", + "https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git", + "https://github.com/kubernetes/kubernetes", + "https://github.com/videolan/vlc", + "https://github.com/apache/struts", + "https://github.com/netdata/netdata", + "https://github.com/apache/maven", + "https://android.googlesource.com/platform/external/conscrypt", + "https://github.com/labstack/echo", + "https://github.com/openstack/nova", + "https://github.com/qemu/qemu", + "https://github.com/django/django", + 
"https://android.googlesource.com/platform/external/libjpeg-turbo", + "https://github.com/nltk/nltk", + "https://github.com/pytorch/pytorch", + "https://github.com/mirror/busybox", + "https://github.com/weechat/weechat", + "https://github.com/moby/moby", + "https://android.googlesource.com/platform/external/skia", + "https://github.com/apache/avro", + "https://github.com/twisted/twisted", + "https://github.com/pyca/pyopenssl", + "https://github.com/oven-sh/bun", + "https://github.com/capstone-engine/capstone", + "https://github.com/apache/atlas", + "https://github.com/psf/black", + "https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git", + "https://github.com/apache/rocketmq", + "https://github.com/lxc/lxc", + "https://github.com/matplotlib/matplotlib", + "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git", + "https://android.googlesource.com/platform/art", + "https://github.com/dart-lang/sdk", + "https://chromium.googlesource.com/webm/libwebp", + "https://github.com/sequelize/sequelize", + "https://github.com/opencv/opencv", + "https://github.com/apache/activemq-artemis", + "https://android.googlesource.com/platform/external/okhttp", + "https://github.com/openssl/openssl", + "https://github.com/pypa/virtualenv", + "https://github.com/apache/commons-beanutils", + "https://github.com/redis/redis-py", + "https://android.googlesource.com/platform/external/webkit", + "https://github.com/backstage/backstage", + "https://github.com/apache/beam", + "https://github.com/mongodb/mongo-go-driver", + "https://github.com/apache/kafka", + "https://github.com/apache/skywalking", + "https://github.com/apache/tomcat", + "https://github.com/vercel/next.js", + "https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git", + "https://github.com/netty/netty", + "https://gerrit.googlesource.com/gerrit", + "https://github.com/typeorm/typeorm", + "https://github.com/bcgit/bc-java", + "https://github.com/open-policy-agent/opa", + 
"https://github.com/nextcloud/android", + "https://github.com/nim-lang/nim", + "https://github.com/microsoft/playwright", + "https://github.com/ollama/ollama", + "https://gitlab.com/freetype/freetype", + "https://github.com/getsentry/sentry-java", + "https://gitlab.gnome.org/gnome/gegl", + "https://chromium.googlesource.com/chromium/src", + "https://github.com/apache/activemq", + "https://github.com/apache/logging-log4j2", + "https://android.googlesource.com/platform/build", + "https://aomedia.googlesource.com/aom", + "https://github.com/jedisct1/pure-ftpd", + "https://github.com/langchain-ai/langchain", + "https://github.com/php/php-src", + "https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git", + "https://android.googlesource.com/platform/external/libpng", + "https://github.com/canonical/lxd", + "https://github.com/vim/vim", + "https://github.com/canonical/snapd", + "https://github.com/gnome/gnome-shell", + "https://github.com/pydantic/pydantic", + "https://github.com/matrix-org/dendrite", + "https://github.com/kubernetes/kube-state-metrics", + "https://gitlab.com/gitlab-org/gitlab-foss", + "https://github.com/openbsd/src", + "https://android.googlesource.com/platform/bionic", + "https://github.com/woocommerce/woocommerce", + "https://github.com/u-boot/u-boot", + "https://github.com/varnishcache/varnish-cache", + "https://github.com/apache/shiro", + "https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git", + "https://github.com/sigstore/cosign", + "https://github.com/apache/airflow", + "https://gitlab.com/mailman/mailman", + "https://github.com/apache/ofbiz-framework", + "https://github.com/mermaid-js/mermaid", + "https://github.com/matrix-org/matrix-js-sdk", + "https://github.com/apache/orc", + "https://github.com/twbs/bootstrap", + "https://chromium.googlesource.com/chromium/blink", + "https://github.com/apache/accumulo", + "https://github.com/neovim/neovim", + "https://github.com/cakephp/cakephp", + 
"https://android.googlesource.com/platform/system/core", + "https://github.com/apache/commons-io", + "https://github.com/bitcoin/bitcoin", + "https://github.com/bytecodealliance/wasm-micro-runtime", + "https://github.com/apache/ant-ivy", + "https://github.com/hashicorp/terraform", + "https://github.com/apache/commons-collections", + "https://github.com/apache/pinot", + "https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git", + "https://github.com/puppetlabs/puppet", + "https://github.com/bagder/curl", + "https://github.com/getsentry/sentry-python", + "https://github.com/openstack/neutron", + "https://github.com/apache/commons-compress", + "https://gitlab.com/gstreamer/gstreamer", + "https://github.com/apache/storm", + "https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git", + "https://github.com/langchain-ai/langchainjs", + "https://github.com/webkit/webkit", + "https://github.com/denoland/deno", + "https://github.com/gnome/glib", + "https://github.com/apache/zookeeper", + "https://android.googlesource.com/platform/external/libvpx", + "https://gitlab.com/wireshark/wireshark", + "https://github.com/langchain-ai/langgraph", + "https://github.com/strongswan/strongswan", + "https://github.com/redis/redis", + "https://github.com/c-ares/c-ares", + "https://android.googlesource.com/platform/frameworks/base", + "https://github.com/numpy/numpy", + "https://github.com/lxc/lxd", + "https://android.googlesource.com/platform/external/boringssl", + "https://gitlab.com/libssh/libssh-mirror", + "https://github.com/apache/hbase", + "https://gitweb.gentoo.org/repo/gentoo.git", + "https://github.com/mono/mono", + "https://git.kernel.org/pub/scm/bluetooth/bluez.git", + "https://github.com/opencontainers/runc", + "https://github.com/jenkinsci/jenkins", + "https://android.googlesource.com/platform/external/freetype", + "https://github.com/apache/nifi", + "https://github.com/python-pillow/pillow", + "https://github.com/apache/arrow", + 
"https://github.com/apache/lucene-solr", + "https://github.com/ansible-collections/community.crypto", + "https://github.com/open-telemetry/opentelemetry-go", + "https://github.com/bitnami-labs/sealed-secrets", + "https://github.com/krb5/krb5", + "https://github.com/apache/pdfbox", + "https://github.com/quarkusio/quarkus", + "https://github.com/openstack/swift", + "https://github.com/git/git", + "https://github.com/sqlite/sqlite", + "https://github.com/imagemagick/imagemagick", + "https://github.com/mongodb/mongo", + "https://github.com/apache/subversion", + "https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git", + "https://github.com/apache/superset", + "https://github.com/openzfs/zfs", + "https://github.com/nixos/nix", + "https://android.googlesource.com/platform/frameworks/av", + "https://github.com/apache/flink", + "https://github.com/ray-project/ray", + "https://github.com/zulip/zulip", + "https://github.com/glennrp/libpng", + "https://github.com/moby/buildkit", + "https://android.googlesource.com/platform/external/libxml2", + "https://github.com/apache/tika", + "https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git", + "https://github.com/gnome/libxml2", + "https://git.kernel.org/pub/scm/git/git.git", + "https://chromium.googlesource.com/webm/libwebm", + "https://github.com/madler/zlib", + "https://github.com/minio/minio", + "https://github.com/istio/istio", + "https://github.com/brave/brave-core", + "https://gitlab.gnome.org/gnome/libxml2", + "https://github.com/bcgit/bc-csharp", + "https://github.com/github/cmark-gfm", + "https://github.com/ruby/ruby", + "https://github.com/nmap/nmap", + "https://github.com/apache/lucene", + "https://github.com/nestjs/nest", + "https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git", + "https://github.com/mariadb/server", + "https://github.com/apache/cassandra", + "https://github.com/apache/drill", + "https://github.com/apache/druid", + "https://github.com/openvpn/openvpn", + 
"https://github.com/apache/archiva", + "https://github.com/ansible/ansible", + "https://github.com/apache/dolphinscheduler", + "https://github.com/apache/karaf", + "https://github.com/gnome/nautilus", + "https://github.com/apache/calcite", + "https://github.com/osquery/osquery", + "https://github.com/dbt-labs/dbt-core", + "https://git.kernel.org/pub/scm/virt/kvm/kvm.git", + "https://github.com/facebook/react", + "https://github.com/radareorg/radare2", + "https://github.com/gin-gonic/gin", + "https://gitlab.com/qemu-project/qemu", + "https://github.com/dart-lang/pub", + "https://github.com/sinatra/sinatra", + "https://github.com/apache/commons-text", + "https://gitlab.gnome.org/gnome/gdk-pixbuf", + "https://github.com/nats-io/nats-server", + "https://github.com/mastodon/mastodon", + "https://github.com/urllib3/urllib3", + "https://github.com/babel/babel", + "https://github.com/open-telemetry/opentelemetry-python-contrib", + "https://github.com/spring-projects/spring-framework", + "https://github.com/caddyserver/caddy", + "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git", + "https://github.com/mattermost/mattermost", + "https://github.com/apache/james-project", + "https://github.com/nuxt/nuxt", + "https://github.com/kubernetes/ingress-nginx", + "https://bitbucket.org/libgd/gd-libgd", + "https://github.com/apache/httpcomponents-client", + "https://github.com/apache/flume", + "https://github.com/git-for-windows/git", + "https://github.com/apache/kylin", + "https://github.com/openjdk/jdk", + "https://github.com/bpftrace/bpftrace", + "https://gitlab.gnome.org/gnome/glib", + "https://github.com/prometheus/prometheus", + "https://bitbucket.org/snakeyaml/snakeyaml", + "https://github.com/nginx/nginx", + "https://chromium.googlesource.com/v8/v8", + "https://github.com/apache/spark", + "https://github.com/influxdata/influxdb", + "https://github.com/pypa/pip", + "https://chromium.googlesource.com/chromium/third_party/ffmpeg", + 
"https://android.googlesource.com/platform/libcore", + "https://github.com/apache/libcloud", + "https://github.com/pyca/cryptography", + "https://github.com/stedolan/jq", + "https://github.com/sveltejs/svelte", + "https://github.com/rabbitmq/rabbitmq-java-client", + "https://android.googlesource.com/platform/cts", + "https://github.com/apache/commons-lang", + "https://github.com/laravel/framework", + "https://github.com/apache/thrift", + "https://github.com/the-tcpdump-group/tcpdump", + "https://github.com/remix-run/react-router", + "https://gitlab.gnome.org/gnome/gimp", + "https://github.com/koajs/koa", + "https://github.com/jellyfin/jellyfin-web", + "https://github.com/borgbackup/borg", + "https://bitbucket.org/connect2id/nimbus-jose-jwt", + "https://github.com/modelcontextprotocol/python-sdk", + "https://github.com/matrix-org/matrix-react-sdk", + "https://github.com/burntsushi/ripgrep", + "https://github.com/open-telemetry/opentelemetry-collector-contrib", + "https://github.com/torvalds/linux", + "https://github.com/apache/solr", + "https://github.com/github/codeql-action", + "https://github.com/getsentry/sentry-javascript", + "https://github.com/sigstore/fulcio", + "https://github.com/apache/zeppelin", + "https://github.com/ory/hydra", + "https://github.com/buildroot/buildroot", + "https://github.com/protocolbuffers/protobuf", + "https://github.com/wireshark/wireshark", + "https://github.com/github/advisory-database", + "https://github.com/github/gh-ost", + "https://github.com/markedjs/marked", + "https://github.com/opencontainers/image-spec", + "https://github.com/nlnetlabs/unbound", + "https://github.com/mitmproxy/mitmproxy", + "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git", + "https://github.com/micropython/micropython", + "https://github.com/strapi/strapi", + "https://github.com/rails/rails", + "https://github.com/ipython/ipython", + "https://github.com/rails/rails-html-sanitizer", + "https://github.com/apache/ant", + 
"https://android.googlesource.com/platform/external/pdfium", + "https://android.googlesource.com/kernel/msm", + "https://github.com/go-gitea/gitea", + "https://github.com/sigstore/sigstore", + "https://github.com/sigstore/rekor", + "https://android.googlesource.com/platform/external/sqlite", + "https://github.com/mysql/mysql-server", + "https://github.com/lxml/lxml", + "https://gitlab.com/libvirt/libvirt", + "https://github.com/getsentry/sentry", + "https://github.com/jellyfin/jellyfin", + "https://github.com/kubernetes/client-go", + "https://github.com/gnome/gimp", + "https://github.com/apache/jena", + "https://github.com/apache/groovy", + "https://github.com/streamlit/streamlit", + "https://github.com/git-lfs/git-lfs", + "https://github.com/apache/commons-fileupload", + "https://github.com/torproject/tor", + "https://github.com/canonical/cloud-init", + "https://github.com/nodejs/node", + "https://github.com/apache/httpd", + "https://github.com/containerd/containerd", + "https://github.com/apache/camel", + "https://android.googlesource.com/platform/dalvik", + "https://github.com/caolan/async", + "https://android.googlesource.com/kernel/common", + "https://github.com/bootstrap-vue/bootstrap-vue", + "https://github.com/webpack/webpack-dev-server", + "https://github.com/swagger-api/swagger-codegen", + "https://github.com/open-telemetry/opentelemetry-dotnet", + "https://github.com/tokio-rs/tracing", + "https://github.com/laravel/laravel", + "https://github.com/FFmpeg/FFmpeg" ] \ No newline at end of file diff --git a/test/test_fix_commits.py b/test/test_fix_commits.py new file mode 100644 index 0000000..19d1fbb --- /dev/null +++ b/test/test_fix_commits.py @@ -0,0 +1,71 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +from unittest.mock import MagicMock + +import pytest + +from fix_commits_collector import CollectVCSFixCommitPipeline + + +class TestCollectVCSFixCommitPipeline: + def test_collect_fix_commits(self): + vcs_url = "https://github.com/aboutcode-org/test" + pipeline = CollectVCSFixCommitPipeline(vcs_url=vcs_url) + + pipeline.repo = MagicMock() + commit_1 = MagicMock( + hexsha="dd7769fbc97c84545579cebf1dc4838214098a11", + message=" fixes cve-2023-40024 \n", + ) + commit_2 = MagicMock( + hexsha="ab801c46c0b0e8b921f690ea47c927379e8862a3", + message="Update README file", + ) + commit_3 = MagicMock( + hexsha="ab801c46c0b0e8b921f690ea47c927379e8862a3", + message="Patch CVE-2026-21711 and GHSA-vcqx-cqfc-xc2r", + ) + + pipeline.repo.iter_commits.return_value = [commit_1, commit_2, commit_3] + result = pipeline.collect_fix_commits() + + assert result["vcs_url"] == vcs_url + assert result["vulnerabilities"] == { + "CVE-2023-40024": { + "dd7769fbc97c84545579cebf1dc4838214098a11": "fixes cve-2023-40024" + }, + "CVE-2026-21711": { + "ab801c46c0b0e8b921f690ea47c927379e8862a3": "Patch CVE-2026-21711 and GHSA-vcqx-cqfc-xc2r" + }, + "GHSA-VCQX-CQFC-XC2R": { + "ab801c46c0b0e8b921f690ea47c927379e8862a3": "Patch CVE-2026-21711 and GHSA-vcqx-cqfc-xc2r" + }, + } + + +@pytest.mark.parametrize( + "commit_message, expected_matches", + [ + ("Update README.md with instructions", []), + ("Fixes CVE-2023-12345 in the backend", ["CVE-2023-12345"]), + ("Fix GHSA-2ggp-cmvm-f62f here", ["GHSA-2ggp-cmvm-f62f"]), + ( + "fixes cve-2026-21711 and ghsa-vcqx-cqfc-xc2r", + ["cve-2026-21711", "ghsa-vcqx-cqfc-xc2r"], + ), + ("Fix CVE-2020-123456789gff0", []), + ], +) +def test_extract_vulnerability_id(commit_message, expected_matches): + pipeline = CollectVCSFixCommitPipeline( + vcs_url="https://github.com/aboutcode-org/test" + ) + commit = MagicMock() + commit.message = commit_message + result = pipeline.extract_vulnerability_id(commit) + assert set(result) == set(expected_matches) diff --git 
a/test/test_issues_prs.py b/test/test_issues_prs.py new file mode 100644 index 0000000..2ea3cf0 --- /dev/null +++ b/test/test_issues_prs.py @@ -0,0 +1,105 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from unittest.mock import MagicMock, patch + +import pytest +from packageurl import PackageURL + +from issues_prs_collector import GitHubCollector, GitLabCollector + + +class TestGitHubCollector: + def setup_method(self): + purl = PackageURL(type="github", namespace="aboutcode-org", name="test") + self.github_collector = GitHubCollector( + vcs_url="https://github.com/aboutcode-org/test", purl=purl + ) + + @patch("os.getenv", return_value=None) + def test_missing_token(self, mock_getenv): + with pytest.raises( + ValueError, match="GH_API_TOKEN environment variable not set properly" + ): + self.github_collector.fetch_entries() + + def test_collect_items(self): + issue1 = MagicMock() + issue1.title = "Fix CVE-2024-1234" + issue1.body = "test description" + issue1.html_url = "https://github.com/aboutcode-org/test/issues/1" + + pr1 = MagicMock() + pr1.title = "Bump deps" + pr1.body = "Fixes CVE-2024-5678" + pr1.html_url = "https://github.com/aboutcode-org/test/pulls/1" + + self.github_collector.issues = [issue1] + self.github_collector.prs = [pr1] + + self.github_collector.collect_items() + + assert self.github_collector.collected_items["vulnerabilities"] == { + "CVE-2024-1234": { + "Issues": ["https://github.com/aboutcode-org/test/issues/1"], + "PRs": [], + }, + "CVE-2024-5678": { + "Issues": [], + "PRs": ["https://github.com/aboutcode-org/test/pulls/1"], + }, + } + + +class TestGitLabCollector: + def setup_method(self): + purl = PackageURL(type="gitlab", namespace="gitlab-org", name="gitlab-foss") + self.gitlab_collector = GitLabCollector( + 
vcs_url="https://gitlab.com/gitlab-org/gitlab-foss", purl=purl + ) + + @patch("os.getenv", return_value=None) + def test_missing_token(self, mock_getenv): + with pytest.raises( + ValueError, match="GLAB_API_TOKEN environment variable not set properly" + ): + self.gitlab_collector.fetch_entries() + + def test_collect_items(self): + self.gitlab_collector.issues = [ + { + "title": "Need security update for CVE-2018-11235", + "description": "At the end of May, a severe security vulnerability was discovered in Git that pertains to submodules..", + "web_url": "https://gitlab.com/gitlab-org/gitlab-foss/-/issues/29992", + }, + { + "title": "Bump KaTeX version", + "description": "No cve here", + "web_url": "https://gitlab.com/gitlab-org/gitlab-foss/-/issues/51065", + }, + ] + self.gitlab_collector.prs = [ + { + "title": "Temporarily ignore Nokogiri CVE-2016-4658", + "description": "we can't do anything about it quickly, so we'll ignore the CVE in bundle-audit.", + "web_url": "https://gitlab.com/gitlab-org/gitlab-foss/-/merge_requests/10218", + } + ] + + self.gitlab_collector.collect_items() + assert self.gitlab_collector.collected_items["vulnerabilities"] == { + "CVE-2018-11235": { + "Issues": ["https://gitlab.com/gitlab-org/gitlab-foss/-/issues/29992"], + "PRs": [], + }, + "CVE-2016-4658": { + "Issues": [], + "PRs": [ + "https://gitlab.com/gitlab-org/gitlab-foss/-/merge_requests/10218" + ], + }, + }