From ca910ff22f2aed70b811039b1ab396789316a374 Mon Sep 17 00:00:00 2001
From: Koen Van Impe
Date: Thu, 29 Jul 2021 10:46:59 +0200
Subject: [PATCH] Support for import of NSRL datasets in ISO and ZIP format

---
 bin/import-poc/README.md                   |  30 +++-
 bin/import-poc/config.json                 |  11 +-
 bin/import-poc/import-hashlookup-server.py | 156 +++++++++++++++++----
 3 files changed, 165 insertions(+), 32 deletions(-)

diff --git a/bin/import-poc/README.md b/bin/import-poc/README.md
index d3ee74c..d98e7ac 100644
--- a/bin/import-poc/README.md
+++ b/bin/import-poc/README.md
@@ -2,9 +2,33 @@
 
 PoC to better streamline the import of NSRL data.
 
-Todo:
+## Usage
 
-- Test with the other data sets (currently only Android was tested) : Fetch from ZIP and not ISO file
+```
+$ python3 import-hashlookup-server.py -h
+usage: import-hashlookup-server.py [-h] [-l | -i IMPORT_DATASET | -e INIT_DATASET] [-d] [-c]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l, --list            List datasets available for download and import.
+  -i IMPORT_DATASET, --import-dataset IMPORT_DATASET
+                        Import a dataset.
+  -e INIT_DATASET, --init-dataset INIT_DATASET
+                        Remove / initialize a dataset.
+  -d, --skip-download   Skip downloading the dataset.
+  -c, --skip-init       Skip initialization of the database.
+```
+
+```
+$ python3 import-hashlookup-server.py -i nsrl_minimal
+```
+
+## Todo
+
+
+- ~~Test with the other data sets (currently only Android was tested) : Fetch from ZIP and not ISO file~~
+- Move older import scripts to the "old" directory
+- Complete with sha256 and xcycl
 - Error handling (sufficient drive space, Redis active, check if there is already a db before init)
 - Multiple data sets at once?
-- Import from MISP (depends on filter)
+- Import from MISP (depends on filter)
\ No newline at end of file
diff --git a/bin/import-poc/config.json b/bin/import-poc/config.json
index 1af0762..1746734 100644
--- a/bin/import-poc/config.json
+++ b/bin/import-poc/config.json
@@ -4,15 +4,20 @@
     "nsrl_minimal": { "description": "(minimal) - contains the set of DISTINCT appearances of files in modern applications; no file entries are duplicated",
                       "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
     "nsrl_android": { "description": "Contains modern Android mobile applications",
-                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"}
+                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"},
+    "nsrl_unique": { "description": "Contains the set of file entries that appear ONLY ONCE in the entire NSRL collection; these are unique to each of the applications in the collection",
+                     "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernu.zip"},
+    "nsrl_ios": { "description": "iOS dataset",
+                  "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_ios.iso"}
   },
   "local_path": "/home/koenv/nsrl/",
   "import": {
     "max_value": 500000000,
-    "mod_lines": 1000
+    "mod_lines": 2500
   },
   "redis": {
     "hostname": "127.0.0.1",
-    "port": 6666
+    "port": 6666,
+    "flushdb_on_init": 1
   }
 }
\ No newline at end of file
diff --git a/bin/import-poc/import-hashlookup-server.py b/bin/import-poc/import-hashlookup-server.py
index 9a6b16e..67c4e72 100644
--- a/bin/import-poc/import-hashlookup-server.py
+++ b/bin/import-poc/import-hashlookup-server.py
@@ -5,6 +5,8 @@ import wget
 import sys
 import redis
 import json
+import time
+import argparse
 
 
 class import_hash:
@@ -17,10 +19,14 @@ class import_hash:
         self.local_path = data["local_path"]
         redis_host = data["redis"]["hostname"]
         redis_port = data["redis"]["port"]
+        self.flushrdb = data["redis"]["flushdb_on_init"]
         self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
 
     def download(self, dataset=False):
+        """ Download a dataset
+        :param dataset: The dataset to use; a key looked up in config.json to get the correct download URL
+        """
         if not dataset:
             self.error("no dataset")
 
@@ -28,12 +34,21 @@ class import_hash:
         wget.download(self.hash_datasets[dataset]["url"], self.local_path)
         print("\nDownload completed.")
 
-    def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
-
+    def __process_nsrl_support(self, isofile, dataset_file, key):
+        """ Process supporting NSRL data (OS, Product, Vendor/Manufacturer)
+        :param isofile: The ISO file object; when set to False, the NSRL dataset was provided in ZIP format
+        :param dataset_file: The location of the dataset; either a path inside the ISO or a direct file path
+        :param key: The type of supporting NSRL data
+        """
         print("\n Work on {0}".format(dataset_file))
 
+        if isofile:
+            with_element = isofile.IsoPath("/{0}".format(dataset_file)).open()
+        else:
+            with_element = open(dataset_file, encoding='utf-8')
+
         ln = 0
-        with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
+        with with_element as f:
             while True:
                 l = f.readline()
@@ -51,9 +66,10 @@
                 except:
                     continue
 
-                self.rdb.sadd("s:{0}".format(key1), drecords[key1])
-                self.rdb.hmset("h-{0}:{1}".format(key1, drecords[key1]), drecords)
-                self.rdb.incrby("stat:{0}-import".format(key2))
+                self.rdb.sadd("s:{0}".format(key), drecords[key])
+                self.rdb.hmset("h-{0}:{1}".format(key, drecords[key]), drecords)
+                stat_import_key = dataset_file[dataset_file.rfind("/")+1:dataset_file.rfind(".txt")]
+                self.rdb.incrby("stat:{0}-import".format(stat_import_key))
 
                 if ln % self.mod_lines == 0:
                     print(" Imported {0} records.".format(ln))
@@ -63,19 +79,32 @@
                 ln = ln + 1
         print(" Finished, importing {0} records.".format(ln))
 
-    def __process_nsrl_zip(self, isofile, dataset_file, key):
+    def __process_nsrl_base(self, isofile, dataset_file, rdbkey):
+        """ Process base NSRL data (file hashes)
+        :param isofile: The ISO file object; when set to False, the NSRL dataset was provided in ZIP format
+        :param dataset_file: The location of the dataset; either a path inside the ISO or a direct file path
+        :param rdbkey: The Redis database key (corresponds to the key of the dataset in config.json)
+        """
         print("\n Work on {0}".format(dataset_file))
 
-        # First get the ZIP from the ISO and then extract the ZIP
-        zip_f = open(self.local_path + dataset_file, "wb")
-        with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
-            zip_f.write((f.read()))
-        zip_f.close()
-        zip_f = zipfile.ZipFile(self.local_path + dataset_file)
-        zip_f.extractall(self.local_path)
-
+        if isofile:
+            # We received the NSRL dataset as an ISO file
+            # First get the ZIP from the ISO and then extract the ZIP
+            zip_f = open(self.local_path + dataset_file, "wb")
+            with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
+                zip_f.write(f.read())
+            zip_f.close()
+            zip_f = zipfile.ZipFile(self.local_path + dataset_file)
+            zip_f.extractall(self.local_path)
+
+            local_dataset_file = self.local_path + "NSRLFile.txt"
+        else:
+            # No additional actions needed;
+            # we probably received the NSRL dataset as a ZIP file
+            local_dataset_file = dataset_file
+
         ln = 0
-        lines = open(self.local_path + "NSRLFile.txt", "r")
+        lines = open(local_dataset_file, "r")
 
         for l in lines:
             if ln == 0:
@@ -89,9 +118,15 @@
             except:
                 continue
 
+            # Add some metadata
+            drecords['source'] = "NSRL"
+            drecords['db'] = rdbkey
+            drecords['insert-timestamp'] = time.time()
+
+            # Base records
             self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
             self.rdb.hmset("h:{}".format(drecords['SHA-1']), drecords)
-            self.rdb.incrby("stat:{0}".format(key))
+            self.rdb.incrby("stat:{0}".format(rdbkey))
 
             if ln % self.mod_lines == 0:
                 print(" Imported {0} records.".format(ln))
@@ -101,6 +136,9 @@
         print(" Finished, importing {0} records.".format(ln))
 
     def process(self, dataset=False):
+        """ Process a dataset
+        :param dataset: The dataset to process
+        """
         if not dataset:
             self.error("no dataset")
 
@@ -115,28 +153,94 @@
         dataset_file_type = local_dataset[local_dataset.rfind(".")+1:]
 
         if dataset_file_type == "iso":
+            # We read directly from the ISO file
             isofile = pathlab.IsoAccessor(local_dataset)
 
-            self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
-            self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
-            self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
-            self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")
+            self.__process_nsrl_base(isofile, "NSRLFILE.ZIP", dataset)
+            self.__process_nsrl_support(isofile, "NSRLMFG.TXT", "MfgCode")
+            self.__process_nsrl_support(isofile, "NSRLOS.TXT", "OpSystemCode")
+            self.__process_nsrl_support(isofile, "NSRLPROD.TXT", "ProductCode")
+        elif dataset_file_type == "zip":
+            # Extract the ZIP
+            zip_f = zipfile.ZipFile(local_dataset)
+            zip_f.extractall(self.local_path)
+
+            # NSRL ZIPs store the data files in a subdirectory
+            namelist_first = zip_f.namelist()[0]
+            zip_extract_path = ""
+            if namelist_first[-1] == "/":
+                zip_extract_path = self.local_path + namelist_first
+
+            # Indicate we don't have an ISO object
+            isofile = False
+
+            self.__process_nsrl_base(isofile, zip_extract_path + "NSRLFile.txt", dataset)
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLMfg.txt", "MfgCode")
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLOS.txt", "OpSystemCode")
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLProd.txt", "ProductCode")
 
     def init(self, dataset=False):
+        """ Remove / initialize a dataset
+        :param dataset: Affected dataset
+        """
         if not dataset:
             self.error("no dataset")
 
         print("**INIT** dataset {0} .".format(dataset))
-        self.rdb.delete("stat:{0}".format(dataset))
-        self.rdb.set("stat:{0}".format(dataset), 0)
+        if self.flushrdb:
+            self.rdb.flushdb()
+        else:
+            self.rdb.delete("stat:{0}".format(dataset))
+            self.rdb.set("stat:{0}".format(dataset), 0)
+
+    def datasetlist(self):
+        """ List the available datasets
+        """
+        for nsrl in self.hash_datasets:
+            print("{0}\n {1}\n from: {2}\n".format(nsrl, self.hash_datasets[nsrl]["description"], self.hash_datasets[nsrl]["url"]))
+
+    def valid_dataset(self, dataset):
+        """ Verify that the dataset exists
+        :param dataset: Affected dataset
+        """
+        if dataset in self.hash_datasets:
+            return True
+        else:
+            return False
 
     def error(self, error):
+        """ Print an error message and exit
+        :param error: Error message
+        """
         print("!!ERROR!! {0}".format(error))
         sys.exit()
 
 
+parser = argparse.ArgumentParser()
+group = parser.add_mutually_exclusive_group()
+group.add_argument("-l", "--list", action="store_true", help="List datasets available for download and import.")
+group.add_argument("-i", "--import-dataset", help="Import a dataset.")
+group.add_argument("-e", "--init-dataset", help="Remove / initialize a dataset.")
+parser.add_argument("-d", "--skip-download", action="store_true", help="Skip downloading the dataset.")
+parser.add_argument("-c", "--skip-init", action="store_true", help="Skip initialization of the database.")
+args = parser.parse_args()
+
 import_hash = import_hash()
-#import_hash.download(dataset="nsrl_android")
-import_hash.init(dataset="nsrl_android")
-import_hash.process(dataset="nsrl_android")
+
+if args.list:
+    import_hash.datasetlist()
+elif args.import_dataset:
+    dataset = args.import_dataset
+    if import_hash.valid_dataset(dataset):
+        if not args.skip_download:
+            import_hash.download(dataset=dataset)
+        if not args.skip_init:
+            import_hash.init(dataset=dataset)
+        import_hash.process(dataset=dataset)
+    else:
+        print("Dataset not found.")
+elif args.init_dataset:
+    dataset = args.init_dataset
+    if import_hash.valid_dataset(dataset):
+        import_hash.init(dataset=dataset)
+    else:
+        print("Dataset not found.")
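
The patch writes base records to Redis as `l:<MD5>` (a string holding the SHA-1) and `h:<SHA-1>` (a hash holding the full NSRL record plus the added `source`, `db`, and `insert-timestamp` fields); supporting data lands in `h-<key>:<value>` hashes. A minimal lookup sketch against that layout for verifying an import, assuming the Redis instance from config.json (127.0.0.1:6666); the MD5 value below is a placeholder, not a real NSRL entry:

```
import redis

rdb = redis.Redis(host="127.0.0.1", port=6666, decode_responses=True)

md5 = "0" * 32  # placeholder MD5 of the file to look up
sha1 = rdb.get("l:{}".format(md5))  # base record: MD5 -> SHA-1
if sha1:
    record = rdb.hgetall("h:{}".format(sha1))  # full NSRL record + added metadata
    # Field names follow the NSRLFile.txt header row (e.g. FileName, ProductCode)
    print(record.get("FileName"), record.get("source"), record.get("db"))
    # Supporting data written by __process_nsrl_support()
    product = rdb.hgetall("h-ProductCode:{}".format(record.get("ProductCode")))
    print(product)
else:
    print("Hash not found.")
```

The same pattern works for the other supporting keys (`h-MfgCode:<code>`, `h-OpSystemCode:<code>`), and the `stat:*` counters give a quick sanity check on import volume.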
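The Todo list still carries "Error handling (sufficient drive space, Redis active)" as open. A possible pre-flight sketch, not part of this patch, assuming the same config.json layout; the 10 GB threshold is an arbitrary placeholder:

```
import json
import shutil
import sys

import redis

with open("config.json") as f:
    data = json.load(f)

# Check that Redis is reachable before starting a multi-GB import
rdb = redis.Redis(host=data["redis"]["hostname"], port=data["redis"]["port"])
try:
    rdb.ping()
except redis.exceptions.ConnectionError:
    sys.exit("!!ERROR!! Redis is not reachable.")

# Check free space in the download/extraction directory
free = shutil.disk_usage(data["local_path"]).free
min_free = 10 * 1024 ** 3  # arbitrary placeholder threshold (10 GB)
if free < min_free:
    sys.exit("!!ERROR!! Not enough free disk space in {0}.".format(data["local_path"]))
```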