#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Finding potential software vulnerabilities from git commit messages
#
# Software is free software released under the "GNU Affero General Public License v3.0"
#
# This software is part of cve-search.org
#
# Copyright (c) 2019-2020 Alexandre Dulaunoy - a@foo.be


import git
import re
import sys
import typing
from langdetect import detect as langdetect
from git_vuln_finder import get_patterns

# CVE identifiers as they appear in commit messages (matched case-insensitively).
_CVE_PATTERN = re.compile(r"CVE-[1-2]\d{1,4}-\d{1,7}", re.IGNORECASE)

# Organisation and repository name of a GitHub remote, for both the SSH form
# ("git@github.com:org/repo.git") and the URL form
# ("https://github.com/org/repo[.git][/]").
_GITHUB_ORIGIN_PATTERN = re.compile(
    r"github\.com[:/](?P<org>[^/]+)/(?P<repo>[^/]+?)(?:\.git)?/?$"
)


def find(
    repo,
    tags_matching=False,
    commit_state="under-review",
    verbose=False,
    defaultpattern="all",
):
    """Scan every local branch of a repository for suspicious commit messages.

    Args:
        repo: Value handed to ``git.Repo`` to open the repository.
        tags_matching: When true, the tags pointing at a matching commit are
            included in its entry.
        commit_state: State recorded for matching commits whose message does
            not mention a CVE identifier.
        verbose: When true, every match is printed to stderr.
        defaultpattern: ``"vulnpatterns"``, ``"cryptopatterns"``,
            ``"cpatterns"`` or ``"all"`` to select patterns returned by
            ``get_patterns()``; a compiled pattern or a list of compiled
            patterns is used as given. Any other value selects no pattern, so
            nothing is reported.

    Returns:
        A tuple ``(all_potential_vulnerabilities, all_cve_found, found)``:
        the entries built by ``summary`` keyed by commit hexsha, the set of
        CVE identifiers seen in matching messages, and the number of
        (branch, commit, pattern) matches.
    """
    # Initialization of the variables for the results
    repo = git.Repo(repo)
    found = 0
    all_potential_vulnerabilities = {}
    all_cve_found = set()

    # Initialization of the patterns
    patterns = get_patterns()
    vulnpatterns = patterns["en"]["medium"]["vuln"]
    cryptopatterns = patterns["en"]["medium"]["crypto"]
    cpatterns = patterns["en"]["medium"]["c"]
    named_patterns = {
        "vulnpatterns": vulnpatterns,
        "cryptopatterns": cryptopatterns,
        "cpatterns": cpatterns,
        "all": [vulnpatterns, cryptopatterns, cpatterns],
    }
    # Only strings are looked up: a list argument is unhashable and a compiled
    # pattern is already what we need.
    if isinstance(defaultpattern, str):
        defaultpattern = named_patterns.get(defaultpattern, defaultpattern)

    # Normalise to a list so a single loop handles every case.
    if isinstance(defaultpattern, typing.Pattern):
        selected_patterns = [defaultpattern]
    elif isinstance(defaultpattern, list):
        selected_patterns = defaultpattern
    else:
        selected_patterns = []

    repo_heads_names = [h.name for h in repo.heads]
    print(repo_heads_names, file=sys.stderr)

    # summary() accepts origin=None, so a repository without a remote named
    # "origin" is reported with an unknown origin instead of aborting the scan.
    try:
        origin = repo.remotes.origin.url
    except AttributeError:
        origin = None

    tagmap = {}
    if tags_matching:
        for t in repo.tags:
            tagmap.setdefault(repo.commit(t).hexsha, []).append(str(t))

    for branch in repo_heads_names:
        for commit in repo.iter_commits(branch):
            for pattern in selected_patterns:
                ret = find_vuln(commit, pattern=pattern, verbose=verbose)
                if not ret:
                    continue
                sha, potential_vulnerabilities, cve_found = summary(
                    repo,
                    ret["commit"],
                    branch,
                    tagmap,
                    pattern,
                    origin=origin,
                    vuln_match=ret["match"],
                    tags_matching=tags_matching,
                    commit_state=commit_state,
                )
                # summary() only knows the current branch; carry over the
                # branches already recorded for this commit so update() does
                # not drop them.
                previous = all_potential_vulnerabilities.get(sha)
                if previous is not None:
                    branches = potential_vulnerabilities[sha]["branches"]
                    branches[:0] = [
                        b for b in previous["branches"] if b not in branches
                    ]
                all_potential_vulnerabilities.update(potential_vulnerabilities)
                all_cve_found.update(cve_found)
                found += 1

    return all_potential_vulnerabilities, all_cve_found, found


def find_vuln(commit, pattern, verbose=False):
    """Find a potential vulnerability from a commit message thanks to a regex
    pattern.

    Args:
        commit: Object whose ``message`` attribute is searched.
        pattern: Compiled regular expression.
        verbose: When true, the matched text and the whole message are printed
            to stderr.

    Returns:
        ``{"commit": commit, "match": <captured groups>}`` when the pattern
        matches, otherwise ``None``.
    """
    m = pattern.search(commit.message)
    if not m:
        return None
    if verbose:
        print("Match found: {}".format(m.group(0)), file=sys.stderr)
        print(commit.message, file=sys.stderr)
        print("---", file=sys.stderr)
    return {"commit": commit, "match": m.groups()}


def _github_commit_api_url(origin, hexsha):
    """Return the GitHub API URL of a commit, or None for a non-GitHub origin."""
    match = _GITHUB_ORIGIN_PATTERN.search(origin)
    if match is None:
        return None
    return "https://api.github.com/repos/{}/{}/commits/{}".format(
        match.group("org"), match.group("repo"), hexsha
    )


def summary(
    repo,
    commit,
    branch,
    tagmap,
    pattern,
    origin=None,
    vuln_match=None,
    tags_matching=False,
    commit_state="under-review",
):
    """Build the report entry describing one matching commit.

    Args:
        repo: Unused; kept so existing callers keep working.
        commit: The matching commit.
        branch: Name of the branch on which the commit was found.
        tagmap: Mapping from commit hexsha to the list of tag names on it.
        pattern: Compiled pattern that matched; its source text is recorded.
        origin: URL of the remote, or ``None`` when unknown.
        vuln_match: Groups captured by the match, recorded as-is.
        tags_matching: When true, tags are looked up in ``tagmap``.
        commit_state: State recorded when the message mentions no CVE.

    Returns:
        A tuple ``(hexsha, {hexsha: entry}, cve_found)`` where ``cve_found``
        is the set of CVE identifiers present in the commit message.
    """
    sha = commit.hexsha
    cve, cve_found = extract_cve(commit.message)

    if origin is None:
        origin_github_api = None
        origin = "git origin unknown"
    else:
        origin_github_api = _github_commit_api_url(origin, sha)

    entry = {
        "message": commit.message,
        "language": langdetect(commit.message),
        "commit-id": sha,
        "summary": commit.summary,
        "stats": commit.stats.total,
        "author": commit.author.name,
        "author-email": commit.author.email,
        "authored_date": commit.authored_date,
        "committed_date": commit.committed_date,
        "branches": [branch],
        "pattern-selected": pattern.pattern,
        "pattern-matches": vuln_match,
        "origin": origin,
    }
    if origin_github_api:
        entry["origin-github-api"] = origin_github_api

    # Copy the tag list so later edits of the entry cannot alter tagmap.
    entry["tags"] = list(tagmap.get(sha, [])) if tags_matching else []

    if cve:
        entry["cve"] = cve
        entry["state"] = "cve-assigned"
    else:
        entry["state"] = commit_state

    return sha, {sha: entry}, cve_found


def extract_cve(commit):
    """Extract the CVE identifiers mentioned in a commit message.

    Args:
        commit: Text of the commit message (a string, despite the name).

    Returns:
        ``(matches, unique)`` where ``matches`` lists every occurrence in
        order of appearance and ``unique`` is the set of distinct identifiers,
        or ``(None, set())`` when the text mentions no CVE.
    """
    matches = _CVE_PATTERN.findall(commit)
    if not matches:
        return None, set()
    return matches, set(matches)