diff --git a/bin/import-poc/README.md b/bin/import-poc/README.md
new file mode 100644
index 0000000..d98e7ac
--- /dev/null
+++ b/bin/import-poc/README.md
@@ -0,0 +1,45 @@
+# hashlookup-server
+
+PoC to streamline the import of NSRL data.
+
+## Usage
+
+```
+$ python3 import-hashlookup-server.py -h
+usage: import-hashlookup-server.py [-h] [-l | -i IMPORT_DATASET | -e INIT_DATASET] [-d] [-c]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l, --list            List datasets available for download and import.
+  -i IMPORT_DATASET, --import-dataset IMPORT_DATASET
+                        Import a dataset.
+  -e INIT_DATASET, --init-dataset INIT_DATASET
+                        Remove / initialize a dataset.
+  -d, --skip-download   Skip downloading the dataset.
+  -c, --skip-init       Skip initialization of the database.
+```
+
+To import a dataset:
+
+```
+$ python3 import-hashlookup-server.py -i nsrl_minimal
+```
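+
+After an import, the data can be spot-checked straight from Redis. A
+minimal sketch, assuming the Redis instance from `config.json` on port
+6666 and placeholder `<MD5>` / `<SHA-1>` values:
+
+```
+$ redis-cli -p 6666 GET l:<MD5>             # SHA-1 belonging to an MD5
+$ redis-cli -p 6666 HGETALL h:<SHA-1>       # full NSRL record for a SHA-1
+$ redis-cli -p 6666 GET stat:nsrl_minimal   # base records imported so far
+```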
data["redis"]["flushdb_on_init"] + + self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True) + + def download(self, dataset=False): + """ Download a dataset + :param dataset: The dataset to use. This is a key looked for in the config.json file to get the correct download URL + """ + if not dataset: + self.error("no dataset") + + print("**DOWNLOAD** dataset {0} from {1} to {2} .".format(dataset, self.hash_datasets[dataset]["url"], self.local_path)) + wget.download(self.hash_datasets[dataset]["url"], self.local_path) + print("\nDownload completed.") + + def __process_nsrl_support(self, isofile, dataset_file, key): + """ Process support NSRL data (OS, Product, Vendor/Manufacturer) + :param isofile: The object to the ISO file. When set to False this indicates the NSRL is provided in a ZIP format + :param dataset_file: The location of the dataset, is either a path in the ISO or a direct filepath + :param key: type of support NSRL data + """ + print("\n Work on {0}".format(dataset_file)) + + if isofile: + with_element = isofile.IsoPath("/{0}".format(dataset_file)).open() + else: + with_element = open(dataset_file, encoding='utf-8') + + ln = 0 + with with_element as f: + while True: + l = f.readline() + + if not l: + break + + if ln == 0: + headers = l.rstrip().replace("\"", "").split(",") + else: + records = l.rstrip().replace("\"", "").split(",") + drecords = {} + for index, value in enumerate(records): + try: + drecords[headers[index]] = value + except: + continue + + self.rdb.sadd("s:{0}".format(key), drecords[key]) + self.rdb.hmset("h-{0}:{1}".format(key, drecords[key]), drecords) + stat_import_key = dataset_file[dataset_file.rfind("/")+1:dataset_file.rfind(".txt")] + self.rdb.incrby("stat:{0}-import".format(stat_import_key)) + if ln % self.mod_lines == 0: + print(" Imported {0} records.".format(ln)) + + if ln == self.max_value: + break + + ln = ln + 1 + print(" Finished, importing {0} records.".format(ln)) + + def __process_nsrl_base(self, isofile, dataset_file, rdbkey): + """ Process base NSRL data (file hashes) + :param isofile: The object to the ISO file. 
+
+    def __process_nsrl_base(self, isofile, dataset_file, rdbkey):
+        """ Process base NSRL data (file hashes)
+        :param isofile: The accessor to the ISO file. When set to False, the NSRL dataset was provided as a ZIP file
+        :param dataset_file: The location of the dataset, either a path inside the ISO or a direct filepath
+        :param rdbkey: Redis database key (corresponds with the key of the dataset in config.json)
+        """
+        print("\n Work on {0}".format(dataset_file))
+
+        if isofile:
+            # We received the NSRL dataset as an ISO file:
+            # first copy the ZIP out of the ISO, then extract the ZIP
+            zip_f = open(self.local_path + dataset_file, "wb")
+            with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
+                zip_f.write(f.read())
+            zip_f.close()
+            zip_f = zipfile.ZipFile(self.local_path + dataset_file)
+            zip_f.extractall(self.local_path)
+
+            local_dataset_file = self.local_path + "NSRLFile.txt"
+        else:
+            # No additional actions needed:
+            # we received the NSRL dataset as a ZIP file
+            local_dataset_file = dataset_file
+
+        ln = 0
+        lines = open(local_dataset_file, "r")
+
+        for l in lines:
+            if ln == 0:
+                headers = l.rstrip().replace("\"", "").split(",")
+            else:
+                records = l.rstrip().replace("\"", "").split(",")
+                drecords = {}
+                for index, value in enumerate(records):
+                    try:
+                        drecords[headers[index]] = value
+                    except IndexError:
+                        continue
+
+                # Add some metadata
+                drecords['source'] = "NSRL"
+                drecords['db'] = rdbkey
+                drecords['insert-timestamp'] = time.time()
+
+                # Base records
+                self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
+                # hset with mapping replaces the deprecated hmset
+                self.rdb.hset("h:{}".format(drecords['SHA-1']), mapping=drecords)
+                self.rdb.incrby("stat:{0}".format(rdbkey))
+
+            if ln % self.mod_lines == 0:
+                print(" Imported {0} records.".format(ln))
+
+            if ln == self.max_value:
+                break
+            ln = ln + 1
+        lines.close()
+        print(" Finished, imported {0} records.".format(ln))
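+
+    # Key layout produced above for base records; resolving a file is a
+    # two-step lookup (sketch, placeholder MD5):
+    #   sha1 = rdb.get("l:<md5>")          - MD5 to SHA-1 pivot
+    #   record = rdb.hgetall("h:" + sha1)  - full NSRL record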
dataset") + + print("**INIT** dataset {0} .".format(dataset)) + + if self.flushrdb: + self.rdb.flushdb() + else: + self.rdb.delete("stat:{0}".format(dataset)) + self.rdb.set("stat:{0}".format(dataset), 0) + + def datasetlist(self): + """ List the available datasets + """ + for nsrl in self.hash_datasets: + print("{0}\n {1}\n from: {2}\n".format(nsrl, self.hash_datasets[nsrl]["description"], self.hash_datasets[nsrl]["url"])) + + def valid_dataset(self, dataset): + """ Verify if the datset exist + :param dataset: Affected dataset + """ + if dataset in self.hash_datasets: + return True + else: + return False + + def error(self, error): + """ Return an error message and exit + :param error: Error message + """ + print("!!ERROR!! {0}".format(error)) + sys.exit() + + +parser = argparse.ArgumentParser() +group = parser.add_mutually_exclusive_group() +group.add_argument("-l", "--list", action="store_true", help="List datasets available for download and import.") +group.add_argument("-i", "--import-dataset", help="Import a dataset.") +group.add_argument("-e", "--init-dataset", help="Remove / initialize a dataset.") +parser.add_argument("-d", "--skip-download", action="store_true", help="Skip downloading the dataset.") +parser.add_argument("-c", "--skip-init", action="store_true", help="Skip initialization of the database.") +args = parser.parse_args() + +import_hash = import_hash() + +if args.list: + import_hash.datasetlist() +elif args.import_dataset: + dataset = args.import_dataset + if import_hash.valid_dataset(dataset): + if not args.skip_download: + import_hash.download(dataset=dataset) + if not args.skip_init: + import_hash.init(dataset=dataset) + import_hash.process(dataset=dataset) + else: + print("Dataset not found.") +elif args.init_dataset: + dataset = args.init_dataset + if import_hash.valid_dataset(dataset): + import_hash.init(dataset=dataset) + else: + print("Dataset not found.") diff --git a/bin/import-poc/requirements b/bin/import-poc/requirements new file mode 100644 index 0000000..a7d3a7b --- /dev/null +++ b/bin/import-poc/requirements @@ -0,0 +1,5 @@ +pathlab +wget +json +zipfile +redis \ No newline at end of file