From 0edb33ed82c75c880d61ec658ff7416ba2e892d2 Mon Sep 17 00:00:00 2001
From: Koen Van Impe
Date: Tue, 27 Jul 2021 18:37:12 +0200
Subject: [PATCH 1/2] PoC for streamlining import

PoC to better streamline the import of NSRL data. Still requires some
work, but the basic concept works. Currently only tested with the
Android dataset.
---
 bin/import-poc/README.md                   |  10 ++
 bin/import-poc/config.json                 |  18 +++
 bin/import-poc/import-hashlookup-server.py | 142 +++++++++++++++++++++
 bin/import-poc/requirements                |   3 +
 4 files changed, 173 insertions(+)
 create mode 100644 bin/import-poc/README.md
 create mode 100644 bin/import-poc/config.json
 create mode 100644 bin/import-poc/import-hashlookup-server.py
 create mode 100644 bin/import-poc/requirements

diff --git a/bin/import-poc/README.md b/bin/import-poc/README.md
new file mode 100644
index 0000000..d3ee74c
--- /dev/null
+++ b/bin/import-poc/README.md
@@ -0,0 +1,10 @@
+# hashlookup-server
+
+PoC to better streamline the import of NSRL data.
+
+Todo:
+
+- Test with the other datasets (currently only Android was tested): fetch from ZIP instead of ISO file
+- Error handling (sufficient drive space, Redis active, check if there is already a db before init)
+- Multiple datasets at once?
+- Import from MISP (depends on filter)
diff --git a/bin/import-poc/config.json b/bin/import-poc/config.json
new file mode 100644
index 0000000..1af0762
--- /dev/null
+++ b/bin/import-poc/config.json
@@ -0,0 +1,18 @@
+{ "nsrl_downloads": {
+    "nsrl_modern_rds": { "description": "(microcomputer applications) - contains the comprehensive set of ALL appearances of files in modern applications; many file entries are duplicated",
+                         "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_modern.iso"},
+    "nsrl_minimal": { "description": "(minimal) - contains the set of DISTINCT appearances of files in modern applications; no file entries are duplicated",
+                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
+    "nsrl_android": { "description": "Contains modern Android mobile applications",
+                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"}
+  },
+  "local_path": "/home/koenv/nsrl/",
+  "import": {
+    "max_value": 500000000,
+    "mod_lines": 1000
+  },
+  "redis": {
+    "hostname": "127.0.0.1",
+    "port": 6666
+  }
+}
\ No newline at end of file
diff --git a/bin/import-poc/import-hashlookup-server.py b/bin/import-poc/import-hashlookup-server.py
new file mode 100644
index 0000000..9a6b16e
--- /dev/null
+++ b/bin/import-poc/import-hashlookup-server.py
@@ -0,0 +1,142 @@
+from pathlib import Path
+import pathlab
+import zipfile
+import wget
+import sys
+import redis
+import json
+
+
+class import_hash:
+    def __init__(self):
+        with open('config.json') as config_file:
+            data = json.load(config_file)
+            self.hash_datasets = data["nsrl_downloads"]
+            self.max_value = data["import"]["max_value"]
+            self.mod_lines = data["import"]["mod_lines"]
+            self.local_path = data["local_path"]
+            redis_host = data["redis"]["hostname"]
+            redis_port = data["redis"]["port"]
+
+        self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
+
+    def download(self, dataset=False):
+        if not dataset:
+            self.error("no dataset")
+
+        print("**DOWNLOAD** dataset {0} from {1} to {2}.".format(dataset, self.hash_datasets[dataset]["url"], self.local_path))
+        wget.download(self.hash_datasets[dataset]["url"], self.local_path)
+        print("\nDownload completed.")
+
+    def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
+
+        print("\n Work on {0}".format(dataset_file))
+
+        ln = 0
+        with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
+            while True:
+                l = f.readline()
+
+                if not l:
+                    break
+
+                if ln == 0:
+                    headers = l.rstrip().replace("\"", "").split(",")
+                else:
+                    records = l.rstrip().replace("\"", "").split(",")
+                    drecords = {}
+                    for index, value in enumerate(records):
+                        try:
+                            drecords[headers[index]] = value
+                        except:
+                            continue
+
+                    self.rdb.sadd("s:{0}".format(key1), drecords[key1])
+                    self.rdb.hmset("h-{0}:{1}".format(key1, drecords[key1]), drecords)
+                    self.rdb.incrby("stat:{0}-import".format(key2))
+                    if ln % self.mod_lines == 0:
+                        print("  Imported {0} records.".format(ln))
+
+                    if ln == self.max_value:
+                        break
+
+                ln = ln + 1
+        print(" Finished, imported {0} records.".format(ln))
+
+    def __process_nsrl_zip(self, isofile, dataset_file, key):
+        print("\n Work on {0}".format(dataset_file))
+
+        # First get the ZIP from the ISO and then extract the ZIP
+        zip_f = open(self.local_path + dataset_file, "wb")
+        with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
+            zip_f.write(f.read())
+        zip_f.close()
+        zip_f = zipfile.ZipFile(self.local_path + dataset_file)
+        zip_f.extractall(self.local_path)
+
+        ln = 0
+        lines = open(self.local_path + "NSRLFile.txt", "r")
+
+        for l in lines:
+            if ln == 0:
+                headers = l.rstrip().replace("\"", "").split(",")
+            else:
+                records = l.rstrip().replace("\"", "").split(",")
+                drecords = {}
+                for index, value in enumerate(records):
+                    try:
+                        drecords[headers[index]] = value
+                    except:
+                        continue
+
+                self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
+                self.rdb.hmset("h:{}".format(drecords['SHA-1']), drecords)
+                self.rdb.incrby("stat:{0}".format(key))
+                if ln % self.mod_lines == 0:
+                    print("  Imported {0} records.".format(ln))
+
+                if ln == self.max_value:
+                    break
+            ln = ln + 1
+        print(" Finished, imported {0} records.".format(ln))
+
+    def process(self, dataset=False):
+        if not dataset:
+            self.error("no dataset")
+
+        local_dataset = self.local_path + self.hash_datasets[dataset]["url"][self.hash_datasets[dataset]["url"].rfind("/")+1:]
+        local_dataset = local_dataset.lower()
+        print("**PROCESS** dataset {0} from location {1}.".format(dataset, local_dataset))
+
+        if not Path(local_dataset).is_file():
+            self.error("Cannot find file {0}".format(local_dataset))
+
+        # Determine dataset file type
+        dataset_file_type = local_dataset[local_dataset.rfind(".")+1:]
+
+        if dataset_file_type == "iso":
+            isofile = pathlab.IsoAccessor(local_dataset)
+
+            self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
+            self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
+            self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
+            self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")
+
+    def init(self, dataset=False):
+        if not dataset:
+            self.error("no dataset")
+
+        print("**INIT** dataset {0}.".format(dataset))
+
+        self.rdb.delete("stat:{0}".format(dataset))
+        self.rdb.set("stat:{0}".format(dataset), 0)
+
+    def error(self, error):
+        print("!!ERROR!! {0}".format(error))
+        sys.exit()
+
+
+import_hash = import_hash()
+#import_hash.download(dataset="nsrl_android")
+import_hash.init(dataset="nsrl_android")
+import_hash.process(dataset="nsrl_android")
diff --git a/bin/import-poc/requirements b/bin/import-poc/requirements
new file mode 100644
index 0000000..a7d3a7b
--- /dev/null
+++ b/bin/import-poc/requirements
@@ -0,0 +1,3 @@
+pathlab
+wget
+redis
\ No newline at end of file
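For reference, the Redis layout written by this first patch can be queried directly: `l:<MD5>` maps an MD5 to its SHA-1, `h:<SHA-1>` holds the full NSRL record, and `stat:<key>` holds an import counter. Below is a minimal sketch of such a lookup, assuming a database already populated by this script on the host/port from config.json; the MD5 value is a hypothetical placeholder:

```
import redis

# Same connection settings the PoC reads from config.json
rdb = redis.Redis(host="127.0.0.1", port=6666, decode_responses=True)

# l:<MD5> maps an MD5 to its SHA-1; h:<SHA-1> holds the full NSRL record
md5 = "0123456789ABCDEF0123456789ABCDEF"  # hypothetical placeholder value
sha1 = rdb.get("l:{}".format(md5))
if sha1:
    print(rdb.hgetall("h:{}".format(sha1)))

# stat:<key> counts the records imported for the Android base dataset
print(rdb.get("stat:NSRLAndroid"))
```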
From ca910ff22f2aed70b811039b1ab396789316a374 Mon Sep 17 00:00:00 2001
From: Koen Van Impe
Date: Thu, 29 Jul 2021 10:46:59 +0200
Subject: [PATCH 2/2] Support for import of NSRL datasets in ISO and ZIP format

---
 bin/import-poc/README.md                   |  30 +++-
 bin/import-poc/config.json                 |  11 +-
 bin/import-poc/import-hashlookup-server.py | 156 +++++++++++++++++----
 3 files changed, 165 insertions(+), 32 deletions(-)

diff --git a/bin/import-poc/README.md b/bin/import-poc/README.md
index d3ee74c..d98e7ac 100644
--- a/bin/import-poc/README.md
+++ b/bin/import-poc/README.md
@@ -2,9 +2,33 @@
 
 PoC to better streamline the import of NSRL data.
 
-Todo:
+## Usage
+
+```
+$ python3 import-hashlookup-server.py -h
+usage: import-hashlookup-server.py [-h] [-l | -i IMPORT_DATASET | -e INIT_DATASET] [-d] [-c]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l, --list            List datasets available for download and import.
+  -i IMPORT_DATASET, --import-dataset IMPORT_DATASET
+                        Import a dataset.
+  -e INIT_DATASET, --init-dataset INIT_DATASET
+                        Remove / initialize a dataset.
+  -d, --skip-download   Skip downloading the dataset.
+  -c, --skip-init       Skip initialization of the database.
+```
+
+```
+$ python3 import-hashlookup-server.py -i nsrl_minimal
+```
+
+## Todo
+
 
-- Test with the other datasets (currently only Android was tested): fetch from ZIP instead of ISO file
+- ~~Test with the other datasets (currently only Android was tested): fetch from ZIP instead of ISO file~~
+- Move older import scripts to an "old" directory
+- Complete with sha256 and xcycl
 - Error handling (sufficient drive space, Redis active, check if there is already a db before init)
 - Multiple datasets at once?
-- Import from MISP (depends on filter)
+- Import from MISP (depends on filter)
\ No newline at end of file
diff --git a/bin/import-poc/config.json b/bin/import-poc/config.json
index 1af0762..1746734 100644
--- a/bin/import-poc/config.json
+++ b/bin/import-poc/config.json
@@ -4,15 +4,20 @@
     "nsrl_minimal": { "description": "(minimal) - contains the set of DISTINCT appearances of files in modern applications; no file entries are duplicated",
                       "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
     "nsrl_android": { "description": "Contains modern Android mobile applications",
-                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"}
+                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"},
+    "nsrl_unique": { "description": "Contains the set of file entries that appear ONLY ONCE in the entire NSRL collection; these are unique to each of the applications that are in the collection",
+                     "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernu.zip"},
+    "nsrl_ios": { "description": "iOS dataset",
+                  "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_ios.iso"}
   },
   "local_path": "/home/koenv/nsrl/",
   "import": {
     "max_value": 500000000,
-    "mod_lines": 1000
+    "mod_lines": 2500
   },
   "redis": {
     "hostname": "127.0.0.1",
-    "port": 6666
+    "port": 6666,
+    "flushdb_on_init": 1
   }
 }
\ No newline at end of file
diff --git a/bin/import-poc/import-hashlookup-server.py b/bin/import-poc/import-hashlookup-server.py
index 9a6b16e..67c4e72 100644
--- a/bin/import-poc/import-hashlookup-server.py
+++ b/bin/import-poc/import-hashlookup-server.py
@@ -5,6 +5,8 @@ import wget
 import sys
 import redis
 import json
+import time
+import argparse
 
 
 class import_hash:
@@ -17,10 +19,14 @@ class import_hash:
             self.local_path = data["local_path"]
             redis_host = data["redis"]["hostname"]
             redis_port = data["redis"]["port"]
+            self.flushrdb = data["redis"]["flushdb_on_init"]
 
         self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
 
     def download(self, dataset=False):
+        """ Download a dataset
+        :param dataset: The dataset to use; a key looked up in config.json to get the correct download URL
+        """
         if not dataset:
             self.error("no dataset")
 
@@ -28,12 +34,21 @@ class import_hash:
         wget.download(self.hash_datasets[dataset]["url"], self.local_path)
         print("\nDownload completed.")
 
-    def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
-
+    def __process_nsrl_support(self, isofile, dataset_file, key):
+        """ Process support NSRL data (OS, Product, Vendor/Manufacturer)
+        :param isofile: The accessor object for the ISO file. When set to False, the NSRL dataset is provided in ZIP format
+        :param dataset_file: The location of the dataset; either a path inside the ISO or a direct filepath
+        :param key: The type of support NSRL data (used in the Redis keys)
+        """
         print("\n Work on {0}".format(dataset_file))
 
+        if isofile:
+            with_element = isofile.IsoPath("/{0}".format(dataset_file)).open()
+        else:
+            with_element = open(dataset_file, encoding='utf-8')
+
         ln = 0
-        with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
+        with with_element as f:
             while True:
                 l = f.readline()
 
@@ -51,9 +66,10 @@
                         except:
                             continue
 
-                    self.rdb.sadd("s:{0}".format(key1), drecords[key1])
-                    self.rdb.hmset("h-{0}:{1}".format(key1, drecords[key1]), drecords)
-                    self.rdb.incrby("stat:{0}-import".format(key2))
+                    self.rdb.sadd("s:{0}".format(key), drecords[key])
+                    self.rdb.hmset("h-{0}:{1}".format(key, drecords[key]), drecords)
+                    stat_import_key = Path(dataset_file).stem  # filename without directory or .TXT/.txt extension
+                    self.rdb.incrby("stat:{0}-import".format(stat_import_key))
                     if ln % self.mod_lines == 0:
                         print("  Imported {0} records.".format(ln))
 
@@ -63,19 +79,32 @@
                 ln = ln + 1
         print(" Finished, imported {0} records.".format(ln))
 
-    def __process_nsrl_zip(self, isofile, dataset_file, key):
+    def __process_nsrl_base(self, isofile, dataset_file, rdbkey):
+        """ Process base NSRL data (file hashes)
+        :param isofile: The accessor object for the ISO file. When set to False, the NSRL dataset is provided in ZIP format
+        :param dataset_file: The location of the dataset; either a path inside the ISO or a direct filepath
+        :param rdbkey: Redis database key (corresponds with the dataset key in config.json)
+        """
         print("\n Work on {0}".format(dataset_file))
 
-        # First get the ZIP from the ISO and then extract the ZIP
-        zip_f = open(self.local_path + dataset_file, "wb")
-        with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
-            zip_f.write(f.read())
-        zip_f.close()
-        zip_f = zipfile.ZipFile(self.local_path + dataset_file)
-        zip_f.extractall(self.local_path)
-
+        if isofile:
+            # We received the NSRL dataset as an ISO file
+            # First get the ZIP from the ISO and then extract the ZIP
+            zip_f = open(self.local_path + dataset_file, "wb")
+            with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
+                zip_f.write(f.read())
+            zip_f.close()
+            zip_f = zipfile.ZipFile(self.local_path + dataset_file)
+            zip_f.extractall(self.local_path)
+
+            local_dataset_file = self.local_path + "NSRLFile.txt"
+        else:
+            # No additional actions needed
+            # We received the NSRL dataset as a ZIP file
+            local_dataset_file = dataset_file
+
         ln = 0
-        lines = open(self.local_path + "NSRLFile.txt", "r")
+        lines = open(local_dataset_file, "r")
 
         for l in lines:
             if ln == 0:
@@ -89,9 +118,15 @@
                     except:
                         continue
 
+                # Add some metadata
+                drecords['source'] = "NSRL"
+                drecords['db'] = rdbkey
+                drecords['insert-timestamp'] = time.time()
+
+                # Base records
                 self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
                 self.rdb.hmset("h:{}".format(drecords['SHA-1']), drecords)
-                self.rdb.incrby("stat:{0}".format(key))
+                self.rdb.incrby("stat:{0}".format(rdbkey))
                 if ln % self.mod_lines == 0:
                     print("  Imported {0} records.".format(ln))
 
@@ -101,6 +136,9 @@ class import_hash:
         print(" Finished, imported {0} records.".format(ln))
 
     def process(self, dataset=False):
+        """ Process a dataset
+        :param dataset: The dataset to process
+        """
         if not dataset:
             self.error("no dataset")
 
@@ -115,28 +153,94 @@ class import_hash:
         dataset_file_type = local_dataset[local_dataset.rfind(".")+1:]
 
         if dataset_file_type == "iso":
+            # We read directly from the ISO file
             isofile = pathlab.IsoAccessor(local_dataset)
 
-            self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
-            self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
-            self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
-            self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")
+            self.__process_nsrl_base(isofile, "NSRLFILE.ZIP", dataset)
+            self.__process_nsrl_support(isofile, "NSRLMFG.TXT", "MfgCode")
+            self.__process_nsrl_support(isofile, "NSRLOS.TXT", "OpSystemCode")
+            self.__process_nsrl_support(isofile, "NSRLPROD.TXT", "ProductCode")
+        elif dataset_file_type == "zip":
+            # Extract the ZIP
+            zip_f = zipfile.ZipFile(local_dataset)
+            zip_f.extractall(self.local_path)
+            # NSRL ZIPs store the datafiles in a subdirectory
+            namelist_first = zip_f.namelist()[0]
+            zip_extract_path = ""
+            if namelist_first[-1] == "/":
+                zip_extract_path = self.local_path + namelist_first
+            # Indicate we don't have an ISO object
+            isofile = False
+
+            self.__process_nsrl_base(isofile, zip_extract_path + "NSRLFile.txt", dataset)
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLMfg.txt", "MfgCode")
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLOS.txt", "OpSystemCode")
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLProd.txt", "ProductCode")
 
     def init(self, dataset=False):
+        """ Remove / initialize a dataset
+        :param dataset: Affected dataset
+        """
         if not dataset:
             self.error("no dataset")
 
         print("**INIT** dataset {0}.".format(dataset))
 
-        self.rdb.delete("stat:{0}".format(dataset))
-        self.rdb.set("stat:{0}".format(dataset), 0)
+        if self.flushrdb:
+            self.rdb.flushdb()
+        else:
+            self.rdb.delete("stat:{0}".format(dataset))
+            self.rdb.set("stat:{0}".format(dataset), 0)
+
+    def datasetlist(self):
+        """ List the available datasets
+        """
+        for nsrl in self.hash_datasets:
+            print("{0}\n {1}\n from: {2}\n".format(nsrl, self.hash_datasets[nsrl]["description"], self.hash_datasets[nsrl]["url"]))
+
+    def valid_dataset(self, dataset):
+        """ Verify that the dataset exists
+        :param dataset: Affected dataset
+        """
+        if dataset in self.hash_datasets:
+            return True
+        else:
+            return False
 
     def error(self, error):
+        """ Print an error message and exit
+        :param error: Error message
+        """
         print("!!ERROR!! {0}".format(error))
         sys.exit()
 
 
+parser = argparse.ArgumentParser()
+group = parser.add_mutually_exclusive_group()
+group.add_argument("-l", "--list", action="store_true", help="List datasets available for download and import.")
+group.add_argument("-i", "--import-dataset", help="Import a dataset.")
+group.add_argument("-e", "--init-dataset", help="Remove / initialize a dataset.")
+parser.add_argument("-d", "--skip-download", action="store_true", help="Skip downloading the dataset.")
+parser.add_argument("-c", "--skip-init", action="store_true", help="Skip initialization of the database.")
+args = parser.parse_args()
+
 import_hash = import_hash()
-#import_hash.download(dataset="nsrl_android")
-import_hash.init(dataset="nsrl_android")
-import_hash.process(dataset="nsrl_android")
+
+if args.list:
+    import_hash.datasetlist()
+elif args.import_dataset:
+    dataset = args.import_dataset
+    if import_hash.valid_dataset(dataset):
+        if not args.skip_download:
+            import_hash.download(dataset=dataset)
+        if not args.skip_init:
+            import_hash.init(dataset=dataset)
+        import_hash.process(dataset=dataset)
+    else:
+        print("Dataset not found.")
+elif args.init_dataset:
+    dataset = args.init_dataset
+    if import_hash.valid_dataset(dataset):
+        import_hash.init(dataset=dataset)
+    else:
+        print("Dataset not found.")
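With the second patch applied, base records also carry the added `source`, `db` and `insert-timestamp` fields, and the support tables written by `__process_nsrl_support` can be joined back onto a file record via the `h-<key>:<value>` hashes. Below is a minimal sketch of such a lookup; the SHA-1 is a hypothetical placeholder, and the field names assume the standard NSRL column headers (`FileName`, `OpSystemCode`/`OpSystemName`, `ProductCode`/`ProductName`):

```
import redis

rdb = redis.Redis(host="127.0.0.1", port=6666, decode_responses=True)

# Hypothetical placeholder; use a SHA-1 known to be in an imported dataset
sha1 = "0000000000000000000000000000000000000000"
record = rdb.hgetall("h:{}".format(sha1))

if record:
    # Resolve the support tables stored under h-<key>:<value>
    os_info = rdb.hgetall("h-OpSystemCode:{}".format(record.get("OpSystemCode", "")))
    product = rdb.hgetall("h-ProductCode:{}".format(record.get("ProductCode", "")))
    print(record.get("FileName"), record.get("source"), record.get("db"))
    print(os_info.get("OpSystemName"), product.get("ProductName"))
```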