diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml new file mode 100644 index 0000000..1389c87 --- /dev/null +++ b/.github/workflows/pythonapp.yml @@ -0,0 +1,35 @@ +name: Python application + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v1 + + - name: Set up Python 3.8 + uses: actions/setup-python@v1 + with: + python-version: 3.8 + + - name: Install dependencies + run: | + pip install poetry + poetry install + + - name: Lint with flake8 + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + - name: Test with pytest + run: | + poetry run pytest + env: + testing: actions diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4a23fa9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,39 @@ +# use glob syntax +syntax: glob + +*.elc +*.pyc +*~ +*.db + +# Virtualenv +venv +build + +# setuptools +build/* +git_vuln_finder.egg-info/ +dist/* + +# tests +.coverage +.mypy_cache/ +.cache/ +test_repos/ + +# sphinx +docs/_build + +# Emacs +eproject.cfg + +# Temporary files (vim backups) +*.swp + +.idea/ + +# Log files: +*.log + +# Vagrant: +.vagrant/ diff --git a/README.md b/README.md index 5487ab7..e660ed4 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,93 @@ ![git-vuln-finder logo](https://raw.githubusercontent.com/cve-search/git-vuln-finder/f22077452c37e110bff0564e1f7b34637dc726c3/doc/logos/git-vuln-finder-small.png) -Finding potential software vulnerabilities from git commit messages. The output format is a JSON with the associated commit which could contain a fix regarding a software vulnerability. The search is based on a set of regular expressions against the commit messages only. 
If CVE IDs are present, those are added automatically in the output. +[![Workflow](https://github.com/cedricbonhomme/git-vuln-finder/workflows/Python%20application/badge.svg?style=flat-square)](https://github.com/cedricbonhomme/git-vuln-finder/actions?query=workflow%3A%22Python+application%22) + +Finding potential software vulnerabilities from git commit messages. +The output format is a JSON with the associated commit which could contain a +fix regarding a software vulnerability. The search is based on a set of regular +expressions against the commit messages only. If CVE IDs are present, those are +added automatically in the output. # Requirements -- Python 3.6 -- GitPython -- langdetect +- jq (``sudo apt install jq``) -# Usage + +# Installation + +## Use it as a library ~~~bash -usage: finder.py [-h] [-v] [-r R] [-o O] [-s S] [-p P] [-c] [-t] +$ poetry install git-vuln-finder +$ poetry shell +~~~ + +You can also use ``pip``. Then just import it: + +~~~python +Python 3.8.0 (default, Dec 11 2019, 21:43:13) +[GCC 9.2.1 20191008] on linux +Type "help", "copyright", "credits" or "license" for more information. +>>> from git_vuln_finder import find +>>> all_potential_vulnerabilities, all_cve_found, found = find("~/git/curl") + +>>> [commit for commit, summary in all_potential_vulnerabilities.items() if summary['state'] == 'cve-assigned'] +['9069838b30fb3b48af0123e39f664cea683254a5', 'facb0e4662415b5f28163e853dc6742ac5fafb3d', +... snap ... 
+ '8a75dbeb2305297640453029b7905ef51b87e8dd', '1dc43de0dccc2ea7da6dddb7b98f8d7dcf323914', '192c4f788d48f82c03e9cef40013f34370e90737', '2eb8dcf26cb37f09cffe26909a646e702dbcab66', 'fa1ae0abcde5df8d0b3283299e3f246bedf7692c', 'c11c30a8c8d727dcf5634fa0cc6ee0b4b77ddc3d', '75ca568fa1c19de4c5358fed246686de8467c238', 'a20daf90e358c1476a325ea665d533f7a27e3364', '042cc1f69ec0878f542667cb684378869f859911'] + + >>> print(json.dumps(all_potential_vulnerabilities['9069838b30fb3b48af0123e39f664cea683254a5'], sort_keys=True, indent=4, separators=(",", ": "))) + { + "author": "Daniel Stenberg", + "author-email": "daniel@haxx.se", + "authored_date": 1567544372, + "branches": [ + "master" + ], + "commit-id": "9069838b30fb3b48af0123e39f664cea683254a5", + "committed_date": 1568009674, + "cve": [ + "CVE-2019-5481", + "CVE-2019-5481" + ], + "language": "en", + "message": "security:read_data fix bad realloc()\n\n... that could end up a double-free\n\nCVE-2019-5481\nBug: https://curl.haxx.se/docs/CVE-2019-5481.html\n", + "origin": "https://github.com/curl/curl.git", + "origin-github-api": "https://api.github.com/repos///github.com/curl/curl/commits/9069838b30fb3b48af0123e39f664cea683254a5", + "pattern-matches": [ + "double-free" + ], + "pattern-selected": "(?i)(double[-| ]free|buffer overflow|double free|race[-| ]condition)", + "state": "cve-assigned", + "stats": { + "deletions": 4, + "files": 1, + "insertions": 2, + "lines": 6 + }, + "summary": "security:read_data fix bad realloc()", + "tags": [] + } +~~~ + + +## Use it as a command line tool + +~~~bash +$ pipx install git-vuln-finder +$ git-vuln-finder --help +~~~ + +You can also use pip. +``pipx`` installs scripts (system wide available) provided by Python packages +into separate virtualenvs to shield them from your system and each other. + + +### Usage + +~~~bash +usage: git-vuln-finder [-h] [-v] [-r R] [-o O] [-s S] [-p P] [-c] [-t] Finding potential software vulnerabilities from git commit messages. 
@@ -33,6 +108,7 @@ optional arguments: More info: https://github.com/cve-search/git-vuln-finder ~~~ + # Patterns git-vuln-finder comes with 3 default patterns which can be selected to find the potential vulnerabilities described in the commit messages such as: @@ -41,10 +117,11 @@ git-vuln-finder comes with 3 default patterns which can be selected to find the - [`cryptopatterns`](https://github.com/cve-search/git-vuln-finder/blob/master/patterns/en/medium/crypto) is a vulnerability pattern for cryptographic errors mentioned in commit messages. - [`cpatterns`](https://github.com/cve-search/git-vuln-finder/blob/master/patterns/en/medium/c) is a set of standard vulnerability patterns see for C/C++-like languages. + ## A sample partial output from Curl git repository ~~~bash -python3 finder.py -r /home/adulau/git/curl | jq . +$ git-vuln-finder -r ~/git/curl | jq . ... "6df916d751e72fc9a1febc07bb59c4ddd886c043": { "message": "loadlibrary: Only load system DLLs from the system directory\n\nInspiration provided by: Daniel Stenberg and Ray Satiro\n\nBug: https://curl.haxx.se/docs/adv_20160530.html\n\nRef: Windows DLL hijacking with curl, CVE-2016-4802\n", @@ -145,26 +222,35 @@ ploit|malicious|directory traversal |\bRCE\b|\bdos\b|\bXSRF \b|\bXSS\b|clickjack } ~~~ + +# Running the tests + +~~~bash +$ pytest +~~~ + + # License and author(s) This software is free software and licensed under the AGPL version 3. -Copyright (c) 2019 Alexandre Dulaunoy - https://github.com/adulau/ +Copyright (c) 2019-2020 Alexandre Dulaunoy - https://github.com/adulau/ + # Acknowledgment - Thanks to [Jean-Louis Huynen](https://github.com/gallypette) for the discussions about the crypto vulnerability patterns. - Thanks to [Sebastien Tricaud](https://github.com/stricaud) for the discussions regarding native language, commit messages and external patterns. + # Contributing We welcome contributions for the software and especially additional vulnerability patterns. 
Every contributors will be added in the [AUTHORS file](./AUTHORS) and collectively own this open source software. The contributors acknowledge the [Developer Certificate of Origin](https://developercertificate.org/). + # References - [Notes](https://gist.github.com/adulau/dce5a6ca5c65017869bb01dfee576303#file-finding-vuln-git-commit-messages-md) - https://csce.ucmss.com/cr/books/2017/LFS/CSREA2017/ICA2077.pdf (mainly using CVE referenced in the commit message) - archive (http://archive.is/xep9o) - https://asankhaya.github.io/pdf/automated-identification-of-security-issues-from-commit-messages-and-bug-reports.pdf (2 main regexps) - - diff --git a/REQUIREMENTS b/REQUIREMENTS deleted file mode 100644 index be616c3..0000000 --- a/REQUIREMENTS +++ /dev/null @@ -1,2 +0,0 @@ -gitpython -langdetect diff --git a/_config.yml b/_config.yml index 2f7efbe..fff4ab9 100644 --- a/_config.yml +++ b/_config.yml @@ -1 +1 @@ -theme: jekyll-theme-minimal \ No newline at end of file +theme: jekyll-theme-minimal diff --git a/bin/finder.py b/bin/finder.py index f7d79ab..8bce93a 100644 --- a/bin/finder.py +++ b/bin/finder.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- # # Finding potential software vulnerabilities from git commit messages @@ -7,217 +7,75 @@ # # This software is part of cve-search.org # -# Copyright (c) 2019 Alexandre Dulaunoy - a@foo.be +# Copyright (c) 2019-2020 Alexandre Dulaunoy - a@foo.be -import os -import re -import git import json import sys import argparse -import typing -from langdetect import detect as langdetect -PATTERNS_PATH="../patterns" - -parser = argparse.ArgumentParser(description = "Finding potential software vulnerabilities from git commit messages.", epilog = "More info: https://github.com/cve-search/git-vuln-finder") -parser.add_argument("-v", help="increase output verbosity", action="store_true") -parser.add_argument("-r", type=str, help="git repository to analyse") -parser.add_argument("-o", type=str, 
def main(argv=None):
    """Command-line entry point: scan a git repository and print the findings.

    The potential vulnerabilities (or, with ``-c``, only the CVE identifiers
    found in commit messages) are written to stdout as JSON so the output can
    be piped to another program; the two summary counters go to stderr.

    ``-o`` is accepted for compatibility but is not consulted: JSON is the
    only output format produced.

    Args:
        argv: Argument list to parse.  ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behaviour.

    Raises:
        SystemExit: With status 2 when ``-p`` names an unknown pattern or
            ``-r`` is missing.  The usage text is still printed, but the
            non-zero status lets calling scripts detect the failure.
    """
    known_patterns = ("vulnpatterns", "cryptopatterns", "cpatterns", "all")

    parser = argparse.ArgumentParser(
        description="Finding potential software vulnerabilities from git commit messages.",
        epilog="More info: https://github.com/cve-search/git-vuln-finder",
    )
    parser.add_argument("-v", help="increase output verbosity", action="store_true")
    parser.add_argument("-r", type=str, help="git repository to analyse")
    parser.add_argument("-o", type=str, help="Output format: [json]", default="json")
    parser.add_argument(
        "-s", type=str, help="State of the commit found", default="under-review"
    )
    parser.add_argument(
        "-p",
        type=str,
        help="Matching pattern to use: [vulnpatterns, cryptopatterns, cpatterns] - the pattern 'all' is used to match all the patterns at once.",
        default="vulnpatterns",
    )
    parser.add_argument(
        "-c",
        help="output only a list of the CVE pattern found in commit messages (disable by default)",
        action="store_true",
    )
    parser.add_argument(
        "-t", help="Include tags matching a specific commit", action="store_true"
    )
    args = parser.parse_args(argv)

    # parser.error() prints the usage line plus the message and exits with
    # status 2, so a wrong invocation is no longer reported as success.
    if args.p not in known_patterns:
        parser.error(
            "unknown pattern {!r} (choose from: {})".format(
                args.p, ", ".join(known_patterns)
            )
        )
    if not args.r:
        parser.error("a git repository is required (-r)")

    # Function-scope import: keeps main() self-contained and only loads the
    # package once the arguments are known to be valid.
    from git_vuln_finder import find

    all_potential_vulnerabilities, all_cve_found, found = find(
        args.r,
        tags_matching=args.t,
        commit_state=args.s,
        verbose=args.v,
        defaultpattern=args.p,
    )

    # Output the result as json. Can be piped to another software.
    # The CVE set is sorted so that repeated runs print the same document.
    if args.c:
        payload = sorted(all_cve_found)
    else:
        payload = all_potential_vulnerabilities
    print(json.dumps(payload))

    # Human-oriented counters go to stderr to keep stdout valid JSON.
    print(
        "{} CVE referenced found in commit(s)".format(len(all_cve_found)),
        file=sys.stderr,
    )
    print(
        "Total potential vulnerability found in {} commit(s)".format(found),
        file=sys.stderr,
    )
def tree():
    """Return an autovivifying nested mapping.

    Missing keys are created on first access as further ``tree()`` mappings,
    so ``patterns[lang][severity][name] = ...`` needs no manual creation of
    the intermediate levels.
    """
    return defaultdict(tree)


# Resolved relative to this module rather than the current working directory,
# so the bundled patterns are also found when the package is imported from
# somewhere other than the repository root.
PATTERNS_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "patterns")


def _read_optional(path):
    """Return the text of *path*, or an empty string if the file is absent."""
    try:
        with open(path, "r", encoding="utf-8") as handle:
            return handle.read()
    except FileNotFoundError:
        # ".prefix" / ".suffix" companions are optional; any other I/O error
        # (permissions, decoding, ...) is a real problem and propagates.
        return ""


def build_pattern(pattern_file):
    """Assemble the regular-expression source for one pattern file.

    Each non-blank line of *pattern_file* (trailing whitespace stripped)
    becomes one alternative; the alternatives are joined with ``|`` and
    wrapped with the verbatim content of ``<pattern_file>.prefix`` and
    ``<pattern_file>.suffix`` when those files exist.

    Blank lines are skipped: an empty alternative would make the resulting
    expression match every commit message.

    Args:
        pattern_file: Path of the file listing one alternative per line.

    Returns:
        The regular-expression source as a string (not compiled).

    Raises:
        OSError: If *pattern_file* itself cannot be read.
    """
    with open(pattern_file, "r", encoding="utf-8") as handle:
        stripped = (line.rstrip() for line in handle)
        alternatives = [alt for alt in stripped if alt]

    # str.join never leaves a trailing "|", so nothing has to be chopped off
    # afterwards (which used to eat the last prefix character when the
    # pattern file was empty).
    return (
        _read_optional(pattern_file + ".prefix")
        + "|".join(alternatives)
        + _read_optional(pattern_file + ".suffix")
    )


def get_patterns(patterns_path=PATTERNS_PATH):
    """Compile every pattern found below *patterns_path*.

    The expected layout is ``<patterns_path>/<lang>/<severity>/<category>``
    with optional ``<category>.prefix`` / ``<category>.suffix`` companions.
    Files that sit less than two directory levels below *patterns_path*
    cannot be assigned a language and severity and are ignored.

    Args:
        patterns_path: Root directory of the pattern tree.

    Returns:
        A ``tree()`` mapping such that
        ``patterns[lang][severity][category]`` is a compiled pattern.
    """
    patterns = tree()
    for root, _dirs, files in os.walk(patterns_path):
        relative = os.path.relpath(root, patterns_path)
        parts = [] if relative == os.curdir else relative.split(os.sep)
        if len(parts) < 2:
            continue
        lang, severity = parts[0], parts[1]

        for name in files:
            # Companion files are consumed by build_pattern(), they are not
            # patterns of their own.
            if name.endswith((".prefix", ".suffix")):
                continue
            rex = build_pattern(os.path.join(root, name))
            patterns[lang][severity][name] = re.compile(rex)

    return patterns
def find(
    repo,
    tags_matching=False,
    commit_state="under-review",
    verbose=False,
    defaultpattern="all",
):
    """Scan every local branch of a git repository for suspicious commits.

    Args:
        repo: Path of the git repository, handed to ``git.Repo``.
        tags_matching: When true, record for each reported commit the tags
            that point at it.
        commit_state: Value stored under ``"state"`` for commits whose
            message carries no CVE identifier.
        verbose: Forwarded to ``find_vuln``, which prints every match to
            stderr when set.
        defaultpattern: One of ``"vulnpatterns"``, ``"cryptopatterns"``,
            ``"cpatterns"`` or ``"all"``; a compiled pattern or a list of
            compiled patterns is used as given.

    Returns:
        A tuple ``(all_potential_vulnerabilities, all_cve_found, found)``:
        a dict keyed by commit id, the set of CVE identifiers seen, and the
        number of (commit, branch, pattern) matches.

    Raises:
        ValueError: If *defaultpattern* is a string that names no known
            pattern set (this used to yield empty results silently).
    """
    # Initialization of the variables for the results
    repo = git.Repo(repo)
    found = 0
    all_potential_vulnerabilities = {}
    all_cve_found = set()

    # Normalise the selection to a list so a single loop handles every case.
    if isinstance(defaultpattern, str):
        medium = get_patterns()["en"]["medium"]
        named = {
            "vulnpatterns": [medium["vuln"]],
            "cryptopatterns": [medium["crypto"]],
            "cpatterns": [medium["c"]],
            "all": [medium["vuln"], medium["crypto"], medium["c"]],
        }
        try:
            selected = named[defaultpattern]
        except KeyError:
            raise ValueError(
                "unknown pattern {!r}; expected one of: {}".format(
                    defaultpattern, ", ".join(named)
                )
            ) from None
    elif isinstance(defaultpattern, (list, tuple)):
        selected = list(defaultpattern)
    else:
        selected = [defaultpattern]

    branch_names = [head.name for head in repo.heads]
    print(branch_names, file=sys.stderr)

    # A repository without an "origin" remote is still worth scanning;
    # summary() receives None in that case.
    origin = next(
        (remote.url for remote in repo.remotes if remote.name == "origin"), None
    )

    tagmap = {}
    if tags_matching:
        for tag in repo.tags:
            tagmap.setdefault(repo.commit(tag).hexsha, []).append(str(tag))

    for branch in branch_names:
        for commit in repo.iter_commits(branch):
            for pattern in selected:
                ret = find_vuln(commit, pattern=pattern, verbose=verbose)
                if not ret:
                    continue
                found += 1
                rcommit = ret["commit"]

                # Deduplication: a commit already reported (from another
                # branch or another pattern) only gains the extra branch
                # name instead of being overwritten and re-analysed.
                known = all_potential_vulnerabilities.get(rcommit.hexsha)
                if known is not None:
                    if branch not in known["branches"]:
                        known["branches"].append(branch)
                    continue

                _, potential_vulnerabilities, cve_found = summary(
                    repo,
                    rcommit,
                    branch,
                    tagmap,
                    pattern,
                    origin=origin,
                    vuln_match=ret["match"],
                    tags_matching=tags_matching,
                    commit_state=commit_state,
                )
                all_potential_vulnerabilities.update(potential_vulnerabilities)
                all_cve_found.update(cve_found)

    return all_potential_vulnerabilities, all_cve_found, found
# Compiled once at import time; both are used for every reported commit.
_CVE_RE = re.compile(r"CVE-[1-2]\d{1,4}-\d{1,7}", re.IGNORECASE)
_GITHUB_REMOTE_RE = re.compile(
    r"(?<![\w-])github\.com[:/]+(?P<org>[^/\s]+)/(?P<repo>[^/\s]+?)(?:\.git)?/?$"
)


def _github_api_url(origin, hexsha):
    """Return the GitHub API URL of commit *hexsha*, or None for non-GitHub remotes.

    Both scp-like remotes (``git@github.com:org/repo.git``) and URL remotes
    (``https://github.com/org/repo.git``) are understood.
    """
    found = _GITHUB_REMOTE_RE.search(origin)
    if found is None:
        return None
    return "https://api.github.com/repos/{}/{}/commits/{}".format(
        found.group("org"), found.group("repo"), hexsha
    )


def find_vuln(commit, pattern, verbose=False):
    """Search a commit message for a potential vulnerability.

    Args:
        commit: Object exposing the commit message as ``commit.message``.
        pattern: Compiled regular expression to search the message with.
        verbose: When true, print the match and the full message to stderr.

    Returns:
        ``{"commit": commit, "match": <tuple of captured groups>}`` when the
        pattern matches, ``None`` otherwise.
    """
    m = pattern.search(commit.message)
    if m is None:
        return None

    if verbose:
        print("Match found: {}".format(m.group(0)), file=sys.stderr)
        print(commit.message, file=sys.stderr)
        print("---", file=sys.stderr)
    return {"commit": commit, "match": m.groups()}


def summary(
    repo,
    commit,
    branch,
    tagmap,
    pattern,
    origin=None,
    vuln_match=None,
    tags_matching=False,
    commit_state="under-review",
):
    """Build the report entry describing one matching commit.

    Args:
        repo: Repository object; ``repo.commit()`` is used to resolve the
            commit when looking up its tags.
        commit: The matching commit (message, hexsha, summary, stats,
            author and dates are read from it).
        branch: Name of the branch the commit was found on.
        tagmap: Mapping of commit id to list of tag names.
        pattern: Compiled pattern that matched; its source is recorded.
        origin: URL of the ``origin`` remote, or ``None`` when unknown.
        vuln_match: Captured groups of the match, stored as is.
        tags_matching: When true, attach the tags found in *tagmap*.
        commit_state: State recorded when the message holds no CVE id.

    Returns:
        ``(hexsha, {hexsha: entry}, cve_found)`` where *cve_found* is the
        set of CVE identifiers found in the commit message.
    """
    hexsha = commit.hexsha
    cve, cve_found = extract_cve(commit.message)

    if origin is not None:
        # None for remotes that are not hosted on GitHub.
        origin_github_api = _github_api_url(origin, hexsha)
    else:
        origin = "git origin unknown"
        origin_github_api = None

    # Keys are inserted in the historical order so the JSON layout is stable.
    entry = {}
    entry["message"] = commit.message
    entry["language"] = langdetect(commit.message)
    entry["commit-id"] = hexsha
    entry["summary"] = commit.summary
    entry["stats"] = commit.stats.total
    entry["author"] = commit.author.name
    entry["author-email"] = commit.author.email
    entry["authored_date"] = commit.authored_date
    entry["committed_date"] = commit.committed_date
    entry["branches"] = [branch]
    entry["pattern-selected"] = pattern.pattern
    entry["pattern-matches"] = vuln_match
    entry["origin"] = origin
    if origin_github_api:
        entry["origin-github-api"] = origin_github_api
    entry["tags"] = []
    if tags_matching:
        tag_key = repo.commit(commit).hexsha
        if tag_key in tagmap:
            # Copied so later edits of the entry cannot alter the tag map.
            entry["tags"] = list(tagmap[tag_key])
    if cve:
        entry["cve"] = cve
        entry["state"] = "cve-assigned"
    else:
        entry["state"] = commit_state

    return hexsha, {hexsha: entry}, cve_found


def extract_cve(commit):
    """Extract the CVE identifiers mentioned in a commit message.

    Args:
        commit: The commit message text (a string, despite the name).

    Returns:
        ``(identifiers, identifier_set)``: the identifiers in order of first
        appearance without repetitions, and the same identifiers as a set.
        ``(None, set())`` when the text mentions no CVE.
    """
    matches = _CVE_RE.findall(commit)
    if not matches:
        return None, set()
    # A CVE is commonly cited twice (id plus advisory URL); report it once.
    unique = list(dict.fromkeys(matches))
    return unique, set(unique)
+marker = "sys_platform == \"win32\"" +name = "atomicwrites" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "1.3.0" + +[[package]] +category = "dev" +description = "Classes Without Boilerplate" +name = "attrs" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "19.3.0" + +[package.extras] +azure-pipelines = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "pytest-azurepipelines"] +dev = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "sphinx", "pre-commit"] +docs = ["sphinx", "zope.interface"] +tests = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"] + +[[package]] +category = "dev" +description = "Cross-platform colored terminal text." +marker = "sys_platform == \"win32\"" +name = "colorama" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +version = "0.4.3" + +[[package]] +category = "dev" +description = "Discover and load entry points from installed packages." 
+name = "entrypoints" +optional = false +python-versions = ">=2.7" +version = "0.3" + +[[package]] +category = "dev" +description = "the modular source code checker: pep8, pyflakes and co" +name = "flake8" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "3.7.9" + +[package.dependencies] +entrypoints = ">=0.3.0,<0.4.0" +mccabe = ">=0.6.0,<0.7.0" +pycodestyle = ">=2.5.0,<2.6.0" +pyflakes = ">=2.1.0,<2.2.0" + +[[package]] +category = "main" +description = "Git Object Database" +name = "gitdb2" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "2.0.6" + +[package.dependencies] +smmap2 = ">=2.0.0" + +[[package]] +category = "main" +description = "Python Git Library" +name = "gitpython" +optional = false +python-versions = ">=3.0, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "3.0.5" + +[package.dependencies] +gitdb2 = ">=2.0.0" + +[[package]] +category = "dev" +description = "Read metadata from Python packages" +marker = "python_version < \"3.8\"" +name = "importlib-metadata" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +version = "1.3.0" + +[package.dependencies] +zipp = ">=0.5" + +[package.extras] +docs = ["sphinx", "rst.linker"] +testing = ["packaging", "importlib-resources"] + +[[package]] +category = "main" +description = "Language detection library ported from Google's language-detection." 
+name = "langdetect" +optional = false +python-versions = "*" +version = "1.0.7" + +[package.dependencies] +six = "*" + +[[package]] +category = "dev" +description = "McCabe checker, plugin for flake8" +name = "mccabe" +optional = false +python-versions = "*" +version = "0.6.1" + +[[package]] +category = "dev" +description = "More routines for operating on iterables, beyond itertools" +name = "more-itertools" +optional = false +python-versions = ">=3.5" +version = "8.0.2" + +[[package]] +category = "dev" +description = "Core utilities for Python packages" +name = "packaging" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "20.0" + +[package.dependencies] +pyparsing = ">=2.0.2" +six = "*" + +[[package]] +category = "dev" +description = "plugin and hook calling mechanisms for python" +name = "pluggy" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "0.13.1" + +[package.dependencies] +[package.dependencies.importlib-metadata] +python = "<3.8" +version = ">=0.12" + +[package.extras] +dev = ["pre-commit", "tox"] + +[[package]] +category = "dev" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +name = "py" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "1.8.1" + +[[package]] +category = "dev" +description = "Python style guide checker" +name = "pycodestyle" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "2.5.0" + +[[package]] +category = "dev" +description = "passive checker of Python programs" +name = "pyflakes" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "2.1.1" + +[[package]] +category = "dev" +description = "Python parsing module" +name = "pyparsing" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +version = "2.4.6" + +[[package]] +category = "dev" +description = "pytest: simple 
powerful testing with Python" +name = "pytest" +optional = false +python-versions = ">=3.5" +version = "5.3.2" + +[package.dependencies] +atomicwrites = ">=1.0" +attrs = ">=17.4.0" +colorama = "*" +more-itertools = ">=4.0.0" +packaging = "*" +pluggy = ">=0.12,<1.0" +py = ">=1.5.0" +wcwidth = "*" + +[package.dependencies.importlib-metadata] +python = "<3.8" +version = ">=0.12" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + +[[package]] +category = "main" +description = "Python 2 and 3 compatibility utilities" +name = "six" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*" +version = "1.13.0" + +[[package]] +category = "main" +description = "A pure Python implementation of a sliding window memory map manager" +name = "smmap2" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "2.0.5" + +[[package]] +category = "dev" +description = "Measures number of Terminal column cells of wide-character codes" +name = "wcwidth" +optional = false +python-versions = "*" +version = "0.1.8" + +[[package]] +category = "dev" +description = "Backport of pathlib-compatible object wrapper for zip files" +marker = "python_version < \"3.8\"" +name = "zipp" +optional = false +python-versions = ">=2.7" +version = "0.6.0" + +[package.dependencies] +more-itertools = "*" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] +testing = ["pathlib2", "contextlib2", "unittest2"] + +[metadata] +content-hash = "9faea1409b72575568ba911a1729a6cc7994603342a0d5ae7b045ed15cd091f8" +python-versions = "^3.6" + +[metadata.files] +atomicwrites = [ + {file = "atomicwrites-1.3.0-py2.py3-none-any.whl", hash = "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4"}, + {file = "atomicwrites-1.3.0.tar.gz", hash = "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"}, +] +attrs = [ + {file = 
"attrs-19.3.0-py2.py3-none-any.whl", hash = "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c"}, + {file = "attrs-19.3.0.tar.gz", hash = "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"}, +] +colorama = [ + {file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"}, + {file = "colorama-0.4.3.tar.gz", hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"}, +] +entrypoints = [ + {file = "entrypoints-0.3-py2.py3-none-any.whl", hash = "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19"}, + {file = "entrypoints-0.3.tar.gz", hash = "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"}, +] +flake8 = [ + {file = "flake8-3.7.9-py2.py3-none-any.whl", hash = "sha256:49356e766643ad15072a789a20915d3c91dc89fd313ccd71802303fd67e4deca"}, + {file = "flake8-3.7.9.tar.gz", hash = "sha256:45681a117ecc81e870cbf1262835ae4af5e7a8b08e40b944a8a6e6b895914cfb"}, +] +gitdb2 = [ + {file = "gitdb2-2.0.6-py2.py3-none-any.whl", hash = "sha256:96bbb507d765a7f51eb802554a9cfe194a174582f772e0d89f4e87288c288b7b"}, + {file = "gitdb2-2.0.6.tar.gz", hash = "sha256:1b6df1433567a51a4a9c1a5a0de977aa351a405cc56d7d35f3388bad1f630350"}, +] +gitpython = [ + {file = "GitPython-3.0.5-py3-none-any.whl", hash = "sha256:c155c6a2653593ccb300462f6ef533583a913e17857cfef8fc617c246b6dc245"}, + {file = "GitPython-3.0.5.tar.gz", hash = "sha256:9c2398ffc3dcb3c40b27324b316f08a4f93ad646d5a6328cafbb871aa79f5e42"}, +] +importlib-metadata = [ + {file = "importlib_metadata-1.3.0-py2.py3-none-any.whl", hash = "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f"}, + {file = "importlib_metadata-1.3.0.tar.gz", hash = "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45"}, +] +langdetect = [ + {file = "langdetect-1.0.7.zip", hash = "sha256:91a170d5f0ade380db809b3ba67f08e95fe6c6c8641f96d67a51ff7e98a9bf30"}, +] 
+mccabe = [ + {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, + {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, +] +more-itertools = [ + {file = "more-itertools-8.0.2.tar.gz", hash = "sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d"}, + {file = "more_itertools-8.0.2-py3-none-any.whl", hash = "sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564"}, +] +packaging = [ + {file = "packaging-20.0-py2.py3-none-any.whl", hash = "sha256:aec3fdbb8bc9e4bb65f0634b9f551ced63983a529d6a8931817d52fdd0816ddb"}, + {file = "packaging-20.0.tar.gz", hash = "sha256:fe1d8331dfa7cc0a883b49d75fc76380b2ab2734b220fbb87d774e4fd4b851f8"}, +] +pluggy = [ + {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, + {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, +] +py = [ + {file = "py-1.8.1-py2.py3-none-any.whl", hash = "sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0"}, + {file = "py-1.8.1.tar.gz", hash = "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa"}, +] +pycodestyle = [ + {file = "pycodestyle-2.5.0-py2.py3-none-any.whl", hash = "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56"}, + {file = "pycodestyle-2.5.0.tar.gz", hash = "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c"}, +] +pyflakes = [ + {file = "pyflakes-2.1.1-py2.py3-none-any.whl", hash = "sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0"}, + {file = "pyflakes-2.1.1.tar.gz", hash = "sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2"}, +] +pyparsing = [ + {file = "pyparsing-2.4.6-py2.py3-none-any.whl", hash = 
"sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec"}, + {file = "pyparsing-2.4.6.tar.gz", hash = "sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f"}, +] +pytest = [ + {file = "pytest-5.3.2-py3-none-any.whl", hash = "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4"}, + {file = "pytest-5.3.2.tar.gz", hash = "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa"}, +] +six = [ + {file = "six-1.13.0-py2.py3-none-any.whl", hash = "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd"}, + {file = "six-1.13.0.tar.gz", hash = "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"}, +] +smmap2 = [ + {file = "smmap2-2.0.5-py2.py3-none-any.whl", hash = "sha256:0555a7bf4df71d1ef4218e4807bbf9b201f910174e6e08af2e138d4e517b4dde"}, + {file = "smmap2-2.0.5.tar.gz", hash = "sha256:29a9ffa0497e7f2be94ca0ed1ca1aa3cd4cf25a1f6b4f5f87f74b46ed91d609a"}, +] +wcwidth = [ + {file = "wcwidth-0.1.8-py2.py3-none-any.whl", hash = "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603"}, + {file = "wcwidth-0.1.8.tar.gz", hash = "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8"}, +] +zipp = [ + {file = "zipp-0.6.0-py2.py3-none-any.whl", hash = "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335"}, + {file = "zipp-0.6.0.tar.gz", hash = "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d853e15 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,60 @@ +[tool.poetry] +name = "git-vuln-finder" +version = "1.0.0" +description = "Finding potential software vulnerabilities from git commit messages." 
+authors = [
+    "Alexandre Dulaunoy "
+]
+license = "AGPL-3.0-or-later"
+
+readme = "README.md"
+
+homepage = "https://github.com/cve-search/git-vuln-finder"
+repository = "https://github.com/cve-search/git-vuln-finder"
+documentation = ""
+
+keywords = [
+    "git",
+    "cve",
+    "scanner",
+    "cve-search",
+    "cve-scanning",
+    "software-vulnerability",
+    "software-vulnerabilities"
+]
+
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Environment :: Console",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Information Technology",
+    "Intended Audience :: Science/Research",
+    "Topic :: Security",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3.6",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
+    "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)"
+]
+
+include = [
+    "AUTHORS",
+    "COPYING",
+    "bin/*"
+]
+
+[tool.poetry.scripts]
+git-vuln-finder = "bin.finder:main"
+
+[tool.poetry.dependencies]
+python = "^3.6"
+langdetect = "^1.0.7"
+gitpython = "^3.0.5"
+
+[tool.poetry.dev-dependencies]
+flake8 = "^3.7.9"
+pytest = "^5.3.2"
+
+[build-system]
+requires = ["poetry>=0.12"]
+build-backend = "poetry.masonry.api"
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..e37d33b
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,30 @@
+
+import os
+import shutil
+
+import pytest
+from git import Repo
+
+
+@pytest.fixture(scope='session')
+def clone_curl():
+    """Clone curl once per test session and yield the path of the clone.
+
+    The clone is removed again once the session's tests have finished.
+    """
+    git_url = 'https://github.com/curl/curl.git'
+    repo_dir = './test_repos/curl'
+
+    # git refuses to clone into a non-empty directory, so drop any clone
+    # left behind by an earlier, interrupted run.
+    if os.path.isdir(repo_dir):
+        shutil.rmtree(repo_dir)
+
+    Repo.clone_from(url=git_url, to_path=repo_dir)
+    # TODO: check out a fixed release (an earlier draft tried
+    # repo.heads['curl-7_67_0']) so results do not drift with upstream.
+
+    yield repo_dir
+
+    # Teardown: os.unlink() cannot remove a directory, hence rmtree.
+    shutil.rmtree(repo_dir, ignore_errors=True)
diff --git a/tests/test_finder.py b/tests/test_finder.py
new file mode 100644
index 0000000..4625ea2
--- /dev/null
+++ b/tests/test_finder.py
@@ -0,0 +1,12 @@
+
+
+from git_vuln_finder import find
+
+
+def test_find_vuln(clone_curl):
+    """Check that find() reports a known curl CVE for the cloned repository."""
+    all_potential_vulnerabilities, all_cve_found, found = find(clone_curl)
+
+    # The total CVE count (64 in an earlier draft) is deliberately not
+    # asserted; presumably it changes as upstream curl gains commits.
+    assert "CVE-2018-1000122" in all_cve_found