PoC for streamlining import
PoC to better streamline the import of NSRL data. Still requires some work, but the basic concept works. Currently only tested with the Android data set.
Parent: 97e2d2c8aa
Commit: 0edb33ed82

4 changed files with 175 additions and 0 deletions
bin/import-poc/README.md (new file, 10 lines)
# hashlookup-server

PoC to better streamline the import of NSRL data.

Todo:

- Test with the other data sets (currently only Android has been tested): fetch from a ZIP instead of an ISO file
- Error handling (sufficient drive space, Redis reachable, check whether a database already exists before init)
- Multiple data sets at once?
- Import from MISP (depends on filter)
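Usage (a minimal sketch, not part of the commit: the PoC script hardcodes the same three calls at module level, with `download` commented out, so the ISO is expected to be on disk already; `importer` is a hypothetical name):

```python
# Hypothetical driver code mirroring the bottom of import-hashlookup-server.py;
# the dataset name must be a key under "nsrl_downloads" in config.json.
importer = ImportHash()
importer.download(dataset="nsrl_android")  # fetch the ISO into local_path
importer.init(dataset="nsrl_android")      # reset the stat counter in Redis
importer.process(dataset="nsrl_android")   # walk the ISO and load records into Redis
```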
bin/import-poc/config.json (new file, 18 lines)
{
  "nsrl_downloads": {
    "nsrl_modern_rds": {
      "description": "(microcomputer applications) - contains the comprehensive set of ALL appearances of files in modern applications; many file entries are duplicated",
      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_modern.iso"
    },
    "nsrl_minimal": {
      "description": "(minimal) - contains the set of DISTINCT appearances of files in modern applications; no file entries are duplicated",
      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"
    },
    "nsrl_android": {
      "description": "Contains modern Android mobile applications",
      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"
    }
  },
  "local_path": "/home/koenv/nsrl/",
  "import": {
    "max_value": 500000000,
    "mod_lines": 1000
  },
  "redis": {
    "hostname": "127.0.0.1",
    "port": 6666
  }
}
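In the script below, `max_value` is the record count at which an import stops early and `mod_lines` is the progress-reporting interval. As a sketch of the error handling listed in the README's Todo, a pre-flight check of this file could look like this (hypothetical helper, not part of the PoC):

```python
import json

def load_config(path="config.json"):
    # Hypothetical pre-flight validation for config.json; the section
    # names match the file shown above.
    with open(path) as config_file:
        cfg = json.load(config_file)
    for section in ("nsrl_downloads", "local_path", "import", "redis"):
        if section not in cfg:
            raise KeyError("config.json is missing the '{0}' section".format(section))
    return cfg
```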
bin/import-poc/import-hashlookup-server.py (new file, 142 lines)
from pathlib import Path
import pathlab
import zipfile
import wget
import sys
import redis
import json


class ImportHash:
    def __init__(self):
        # Load the dataset definitions, import limits, and Redis settings.
        with open('config.json') as config_file:
            data = json.load(config_file)
        self.hash_datasets = data["nsrl_downloads"]
        self.max_value = data["import"]["max_value"]
        self.mod_lines = data["import"]["mod_lines"]
        self.local_path = data["local_path"]
        redis_host = data["redis"]["hostname"]
        redis_port = data["redis"]["port"]

        self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)

    def download(self, dataset=False):
        if not dataset:
            self.error("no dataset")

        print("**DOWNLOAD** dataset {0} from {1} to {2}.".format(
            dataset, self.hash_datasets[dataset]["url"], self.local_path))
        wget.download(self.hash_datasets[dataset]["url"], self.local_path)
        print("\nDownload completed.")

    def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
        # Import one of the NSRL support files (NSRLMfg, NSRLOS, NSRLProd):
        # the first line holds the CSV headers, every following line a record.
        print("\n Work on {0}".format(dataset_file))

        ln = 0
        with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
            while True:
                line = f.readline()
                if not line:
                    break

                if ln == 0:
                    headers = line.rstrip().replace("\"", "").split(",")
                else:
                    records = line.rstrip().replace("\"", "").split(",")
                    drecords = {}
                    for index, value in enumerate(records):
                        try:
                            drecords[headers[index]] = value
                        except IndexError:
                            continue

                    # s:<key1> collects all codes; h-<key1>:<code> stores the record.
                    self.rdb.sadd("s:{0}".format(key1), drecords[key1])
                    self.rdb.hset("h-{0}:{1}".format(key1, drecords[key1]), mapping=drecords)
                    self.rdb.incrby("stat:{0}-import".format(key2))
                    if ln % self.mod_lines == 0:
                        print(" Imported {0} records.".format(ln))
                    if ln == self.max_value:
                        break

                ln = ln + 1
        print(" Finished, imported {0} records.".format(ln))

    def __process_nsrl_zip(self, isofile, dataset_file, key):
        print("\n Work on {0}".format(dataset_file))

        # First copy the ZIP out of the ISO, then extract it locally.
        with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
            with open(self.local_path + dataset_file, "wb") as zip_out:
                zip_out.write(f.read())
        with zipfile.ZipFile(self.local_path + dataset_file) as zip_f:
            zip_f.extractall(self.local_path)

        ln = 0
        with open(self.local_path + "NSRLFile.txt", "r") as lines:
            for line in lines:
                if ln == 0:
                    headers = line.rstrip().replace("\"", "").split(",")
                else:
                    records = line.rstrip().replace("\"", "").split(",")
                    drecords = {}
                    for index, value in enumerate(records):
                        try:
                            drecords[headers[index]] = value
                        except IndexError:
                            continue

                    # l:<MD5> maps to the SHA-1; h:<SHA-1> stores the full record.
                    self.rdb.set("l:{0}".format(drecords['MD5']), drecords['SHA-1'])
                    self.rdb.hset("h:{0}".format(drecords['SHA-1']), mapping=drecords)
                    self.rdb.incrby("stat:{0}".format(key))
                    if ln % self.mod_lines == 0:
                        print(" Imported {0} records.".format(ln))
                    if ln == self.max_value:
                        break
                ln = ln + 1
        print(" Finished, imported {0} records.".format(ln))

    def process(self, dataset=False):
        if not dataset:
            self.error("no dataset")

        local_dataset = self.local_path + self.hash_datasets[dataset]["url"][self.hash_datasets[dataset]["url"].rfind("/") + 1:]
        print("**PROCESS** dataset {0} from location {1}.".format(dataset, local_dataset))

        if not Path(local_dataset).is_file():
            self.error("Cannot find file {0}".format(local_dataset))

        # Determine the dataset file type from the (lower-cased) extension.
        dataset_file_type = local_dataset[local_dataset.rfind(".") + 1:].lower()

        if dataset_file_type == "iso":
            isofile = pathlab.IsoAccessor(local_dataset)

            self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
            self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
            self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
            self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")

    def init(self, dataset=False):
        if not dataset:
            self.error("no dataset")

        print("**INIT** dataset {0}.".format(dataset))

        # Reset the import counter for this dataset.
        self.rdb.delete("stat:{0}".format(dataset))
        self.rdb.set("stat:{0}".format(dataset), 0)

    def error(self, error):
        print("!!ERROR!! {0}".format(error))
        sys.exit(1)


import_hash = ImportHash()
#import_hash.download(dataset="nsrl_android")
import_hash.init(dataset="nsrl_android")
import_hash.process(dataset="nsrl_android")
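Once a run completes, the imported data can be spot-checked directly in Redis. A small sketch, assuming the key layout written by the script above and the port from config.json (the SHA-1 value is a made-up placeholder):

```python
import redis

rdb = redis.Redis(host="127.0.0.1", port=6666, decode_responses=True)

print(rdb.get("stat:NSRLAndroid"))        # records imported from NSRLFile.txt
print(rdb.smembers("s:MfgCode"))          # manufacturer codes seen in NSRLMfg.txt
sha1 = "DA39A3EE5E6B4B0D3255BFEF95601890AFD80709"  # placeholder SHA-1
print(rdb.hgetall("h:{0}".format(sha1)))  # full NSRL record for that hash, if present
```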
bin/import-poc/requirements (new file, 5 lines)
pathlab
wget
redis
# json and zipfile are part of the Python standard library and need no install
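The dependencies can be installed with `pip install -r requirements` (the file has no `.txt` extension, so the path must be given explicitly).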