From 0edb33ed82c75c880d61ec658ff7416ba2e892d2 Mon Sep 17 00:00:00 2001
From: Koen Van Impe
Date: Tue, 27 Jul 2021 18:37:12 +0200
Subject: [PATCH] PoC for streamlining import

PoC to better streamline the import of NSRL data. Still requires some work
but basic concept works. Currently only tested with Android
---
 bin/import-poc/README.md                   |  10 ++
 bin/import-poc/config.json                 |  18 +++
 bin/import-poc/import-hashlookup-server.py | 142 +++++++++++++++++++++
 bin/import-poc/requirements                |   5 +
 4 files changed, 175 insertions(+)
 create mode 100644 bin/import-poc/README.md
 create mode 100644 bin/import-poc/config.json
 create mode 100644 bin/import-poc/import-hashlookup-server.py
 create mode 100644 bin/import-poc/requirements

diff --git a/bin/import-poc/README.md b/bin/import-poc/README.md
new file mode 100644
index 0000000..d3ee74c
--- /dev/null
+++ b/bin/import-poc/README.md
@@ -0,0 +1,10 @@
+# hashlookup-server
+
+PoC to better streamline the import of NSRL data.
+
+Todo:
+
+- Test with the other data sets (currently only Android was tested), including those shipped as a ZIP rather than an ISO file
+- Error handling (sufficient drive space, Redis active, check if there is already a db before init)
+- Multiple data sets at once?
+- Import from MISP (depends on filter)
diff --git a/bin/import-poc/config.json b/bin/import-poc/config.json
new file mode 100644
index 0000000..1af0762
--- /dev/null
+++ b/bin/import-poc/config.json
@@ -0,0 +1,18 @@
+{ "nsrl_downloads": {
+    "nsrl_modern_rds": { "description": "(microcomputer applications) - contains the comprehensive set of ALL appearances of files in modern applications; many file entries are duplicated",
+                         "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_modern.iso"},
+    "nsrl_minimal": { "description": "(minimal) - contains the set of DISTINCT appearances of files in modern applications; no file entries are duplicated",
+                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
+    "nsrl_android": { "description": "Contains modern Android mobile applications",
+                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"}
+  },
+  "local_path": "/home/koenv/nsrl/",
+  "import": {
+    "max_value": 500000000,
+    "mod_lines": 1000
+  },
+  "redis": {
+    "hostname": "127.0.0.1",
+    "port": 6666
+  }
+}
\ No newline at end of file
diff --git a/bin/import-poc/import-hashlookup-server.py b/bin/import-poc/import-hashlookup-server.py
new file mode 100644
index 0000000..9a6b16e
--- /dev/null
+++ b/bin/import-poc/import-hashlookup-server.py
@@ -0,0 +1,184 @@
+from pathlib import Path
+import csv
+import pathlab
+import shutil
+import zipfile
+import wget
+import sys
+import redis
+import json
+
+
+class import_hash:
+    """Import NSRL data sets into the Redis database used by hashlookup.
+
+    All settings (data set URLs, local download directory, import limits and
+    the Redis endpoint) are read from ``config.json`` in the current working
+    directory.
+    """
+
+    def __init__(self):
+        with open('config.json') as config_file:
+            data = json.load(config_file)
+        self.hash_datasets = data["nsrl_downloads"]
+        self.max_value = data["import"]["max_value"]
+        self.mod_lines = data["import"]["mod_lines"]
+        self.local_path = data["local_path"]
+        redis_host = data["redis"]["hostname"]
+        redis_port = data["redis"]["port"]
+
+        self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
+
+    def _dataset_url(self, dataset):
+        """Return the download URL of ``dataset``; exit if it is not configured."""
+        if not dataset:
+            self.error("no dataset")
+        if dataset not in self.hash_datasets:
+            self.error("unknown dataset {0}".format(dataset))
+        return self.hash_datasets[dataset]["url"]
+
+    @staticmethod
+    def _read_records(lines):
+        """Yield one ``{column: value}`` dict per data row of a CSV stream.
+
+        The first row is taken as the header. The ``csv`` module is used instead
+        of splitting on commas so that a quoted value containing a comma stays
+        in one column. Values beyond the header width are dropped; short rows
+        simply lack the trailing columns.
+        """
+        reader = csv.reader(lines)
+        headers = next(reader, None)
+        if headers is None:
+            return
+        for row in reader:
+            yield dict(zip(headers, row))
+
+    def _store_records(self, records, required, store):
+        """Pass each complete record to ``store``; return the number imported.
+
+        Records lacking one of the ``required`` columns are skipped rather than
+        aborting the import. Progress is printed every ``mod_lines`` records
+        and the import stops once ``max_value`` records have been stored.
+        """
+        imported = 0
+        skipped = 0
+        for record in records:
+            if any(column not in record for column in required):
+                skipped += 1
+                continue
+
+            store(record)
+            imported += 1
+            if imported % self.mod_lines == 0:
+                print(" Imported {0} records.".format(imported))
+            if imported == self.max_value:
+                break
+
+        if skipped:
+            print(" Skipped {0} incomplete records.".format(skipped))
+        print(" Finished, importing {0} records.".format(imported))
+        return imported
+
+    def download(self, dataset=False):
+        """Download the archive configured for ``dataset`` into ``local_path``."""
+        url = self._dataset_url(dataset)
+
+        print("**DOWNLOAD** dataset {0} from {1} to {2} .".format(dataset, url, self.local_path))
+        wget.download(url, self.local_path)
+        print("\nDownload completed.")
+
+    def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
+        """Import one CSV lookup table stored inside the ISO image.
+
+        Each row is stored as hash ``h-{key1}:{value}``, its ``key1`` value is
+        added to the set ``s:{key1}`` and ``stat:{key2}-import`` is incremented.
+        """
+        print("\n Work on {0}".format(dataset_file))
+
+        def store(record):
+            self.rdb.sadd("s:{0}".format(key1), record[key1])
+            # hset(mapping=...) replaces hmset(), which redis-py deprecated.
+            self.rdb.hset("h-{0}:{1}".format(key1, record[key1]), mapping=record)
+            self.rdb.incrby("stat:{0}-import".format(key2))
+
+        with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
+            # Feed the csv reader through readline(), so that nothing else is
+            # required of the file object returned for the ISO entry.
+            lines = iter(f.readline, "")
+            self._store_records(self._read_records(lines), (key1,), store)
+
+    def __process_nsrl_zip(self, isofile, dataset_file, key):
+        """Import ``NSRLFile.txt`` from a ZIP archive stored inside the ISO image.
+
+        The archive is copied to ``local_path`` and extracted there. For each
+        row ``l:{MD5}`` is set to the SHA-1, the row is stored as hash
+        ``h:{SHA-1}`` and ``stat:{key}`` is incremented.
+        """
+        print("\n Work on {0}".format(dataset_file))
+
+        local_dir = Path(self.local_path)
+        local_zip = local_dir / dataset_file
+
+        # First get the ZIP from the ISO and then extract the ZIP. Copying in
+        # chunks avoids loading the whole archive into memory.
+        with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as src:
+            with open(local_zip, "wb") as dst:
+                shutil.copyfileobj(src, dst)
+        with zipfile.ZipFile(local_zip) as archive:
+            archive.extractall(local_dir)
+
+        def store(record):
+            self.rdb.set("l:{}".format(record['MD5']), record['SHA-1'])
+            self.rdb.hset("h:{}".format(record['SHA-1']), mapping=record)
+            self.rdb.incrby("stat:{0}".format(key))
+
+        # newline="" is what the csv module expects of the files it reads.
+        with open(local_dir / "NSRLFile.txt", "r", newline="") as lines:
+            self._store_records(self._read_records(lines), ("MD5", "SHA-1"), store)
+
+    def process(self, dataset=False):
+        """Import the previously downloaded archive of ``dataset`` into Redis."""
+        url = self._dataset_url(dataset)
+
+        local_dataset = Path(self.local_path) / url.rsplit("/", 1)[-1]
+        print("**PROCESS** dataset {0} from location {1} .".format(dataset, local_dataset))
+
+        if not local_dataset.is_file():
+            self.error("Cannot find file {0}".format(local_dataset))
+
+        # Only the extension is compared case-insensitively: the file name
+        # itself must keep the case it was downloaded with.
+        if local_dataset.suffix.lower() != ".iso":
+            self.error("Unsupported dataset file type {0}".format(local_dataset.suffix))
+
+        isofile = pathlab.IsoAccessor(str(local_dataset))
+
+        self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
+        self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
+        self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
+        self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")
+
+    def init(self, dataset=False):
+        """Reset the ``stat:{dataset}`` counter in Redis to zero."""
+        if not dataset:
+            self.error("no dataset")
+
+        print("**INIT** dataset {0} .".format(dataset))
+
+        # NOTE(review): process() increments differently named counters
+        # (e.g. stat:NSRLAndroid), which are not reset here - confirm intent.
+        self.rdb.delete("stat:{0}".format(dataset))
+        self.rdb.set("stat:{0}".format(dataset), 0)
+
+    def error(self, error):
+        """Print ``error`` and terminate with a non-zero exit status."""
+        print("!!ERROR!! {0}".format(error))
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    importer = import_hash()
+    # Uncomment to fetch the archive before importing it:
+    # importer.download(dataset="nsrl_android")
+    importer.init(dataset="nsrl_android")
+    importer.process(dataset="nsrl_android")
diff --git a/bin/import-poc/requirements b/bin/import-poc/requirements
new file mode 100644
index 0000000..a7d3a7b
--- /dev/null
+++ b/bin/import-poc/requirements
@@ -0,0 +1,5 @@
+pathlab
+wget
+json
+zipfile
+redis
\ No newline at end of file