From 0edb33ed82c75c880d61ec658ff7416ba2e892d2 Mon Sep 17 00:00:00 2001
From: Koen Van Impe
Date: Tue, 27 Jul 2021 18:37:12 +0200
Subject: [PATCH 1/2] PoC for streamlining import

PoC to better streamline the import of NSRL data. Still requires some
work, but the basic concept works. Currently only tested with the
Android dataset.
---
 bin/import-poc/README.md                   |  10 ++
 bin/import-poc/config.json                 |  18 +++
 bin/import-poc/import-hashlookup-server.py | 142 +++++++++++++++++++++
 bin/import-poc/requirements                |   3 +
 4 files changed, 173 insertions(+)
 create mode 100644 bin/import-poc/README.md
 create mode 100644 bin/import-poc/config.json
 create mode 100644 bin/import-poc/import-hashlookup-server.py
 create mode 100644 bin/import-poc/requirements

diff --git a/bin/import-poc/README.md b/bin/import-poc/README.md
new file mode 100644
index 0000000..d3ee74c
--- /dev/null
+++ b/bin/import-poc/README.md
@@ -0,0 +1,10 @@
+# hashlookup-server
+
+PoC to better streamline the import of NSRL data.
+
+Todo:
+
+- Test with the other datasets (currently only Android was tested): fetch from ZIP instead of ISO file
+- Error handling (sufficient drive space, Redis active, check if there is already a db before init)
+- Multiple datasets at once?
+- Import from MISP (depends on filter)
diff --git a/bin/import-poc/config.json b/bin/import-poc/config.json
new file mode 100644
index 0000000..1af0762
--- /dev/null
+++ b/bin/import-poc/config.json
@@ -0,0 +1,18 @@
+{ "nsrl_downloads": {
+    "nsrl_modern_rds": { "description": "(microcomputer applications) - contains the comprehensive set of ALL appearances of files in modern applications; many file entries are duplicated",
+                         "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_modern.iso"},
+    "nsrl_minimal": { "description": "(minimal) - contains the set of DISTINCT appearances of files in modern applications; no file entries are duplicated",
+                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
+    "nsrl_android": { "description": "Contains modern Android mobile applications",
+                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"}
+  },
+  "local_path": "/home/koenv/nsrl/",
+  "import": {
+    "max_value": 500000000,
+    "mod_lines": 1000
+  },
+  "redis": {
+    "hostname": "127.0.0.1",
+    "port": 6666
+  }
+}
\ No newline at end of file
diff --git a/bin/import-poc/import-hashlookup-server.py b/bin/import-poc/import-hashlookup-server.py
new file mode 100644
index 0000000..9a6b16e
--- /dev/null
+++ b/bin/import-poc/import-hashlookup-server.py
@@ -0,0 +1,142 @@
+from pathlib import Path
+import pathlab
+import zipfile
+import wget
+import sys
+import redis
+import json
+
+
+class import_hash:
+    def __init__(self):
+        with open('config.json') as config_file:
+            data = json.load(config_file)
+            self.hash_datasets = data["nsrl_downloads"]
+            self.max_value = data["import"]["max_value"]
+            self.mod_lines = data["import"]["mod_lines"]
+            self.local_path = data["local_path"]
+            redis_host = data["redis"]["hostname"]
+            redis_port = data["redis"]["port"]
+
+        self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
+
+    def download(self, dataset=False):
+        if not dataset:
+            self.error("no dataset")
+
+        print("**DOWNLOAD** dataset {0} from {1} to {2}.".format(dataset, self.hash_datasets[dataset]["url"], self.local_path))
+        wget.download(self.hash_datasets[dataset]["url"], self.local_path)
+        print("\nDownload completed.")
+
+    def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
+
+        print("\n Work on {0}".format(dataset_file))
+
+        ln = 0
+        with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
+            while True:
+                l = f.readline()
+
+                if not l:
+                    break
+
+                if ln == 0:
+                    headers = l.rstrip().replace("\"", "").split(",")
+                else:
+                    records = l.rstrip().replace("\"", "").split(",")
+                    drecords = {}
+                    for index, value in enumerate(records):
+                        try:
+                            drecords[headers[index]] = value
+                        except:
+                            continue
+
+                    self.rdb.sadd("s:{0}".format(key1), drecords[key1])
+                    self.rdb.hmset("h-{0}:{1}".format(key1, drecords[key1]), drecords)
+                    self.rdb.incrby("stat:{0}-import".format(key2))
+                    if ln % self.mod_lines == 0:
+                        print("  Imported {0} records.".format(ln))
+
+                    if ln == self.max_value:
+                        break
+
+                ln = ln + 1
+        print(" Finished, imported {0} records.".format(ln))
+
+    def __process_nsrl_zip(self, isofile, dataset_file, key):
+        print("\n Work on {0}".format(dataset_file))
+
+        # First get the ZIP from the ISO and then extract the ZIP
+        zip_f = open(self.local_path + dataset_file, "wb")
+        with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
+            zip_f.write(f.read())
+        zip_f.close()
+        zip_f = zipfile.ZipFile(self.local_path + dataset_file)
+        zip_f.extractall(self.local_path)
+
+        ln = 0
+        lines = open(self.local_path + "NSRLFile.txt", "r")
+
+        for l in lines:
+            if ln == 0:
+                headers = l.rstrip().replace("\"", "").split(",")
+            else:
+                records = l.rstrip().replace("\"", "").split(",")
+                drecords = {}
+                for index, value in enumerate(records):
+                    try:
+                        drecords[headers[index]] = value
+                    except:
+                        continue
+
+                self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
+                self.rdb.hmset("h:{}".format(drecords['SHA-1']), drecords)
+                self.rdb.incrby("stat:{0}".format(key))
+                if ln % self.mod_lines == 0:
+                    print("  Imported {0} records.".format(ln))
+
+                if ln == self.max_value:
+                    break
+            ln = ln + 1
+        print(" Finished, imported {0} records.".format(ln))
+
+    def process(self, dataset=False):
+        if not dataset:
+            self.error("no dataset")
+
+        local_dataset = self.local_path + self.hash_datasets[dataset]["url"][self.hash_datasets[dataset]["url"].rfind("/")+1:]
+        local_dataset = local_dataset.lower()
+        print("**PROCESS** dataset {0} from location {1}.".format(dataset, local_dataset))
+
+        if not Path(local_dataset).is_file():
+            self.error("Cannot find file {0}".format(local_dataset))
+
+        # Determine dataset file type
+        dataset_file_type = local_dataset[local_dataset.rfind(".")+1:]
+
+        if dataset_file_type == "iso":
+            isofile = pathlab.IsoAccessor(local_dataset)
+
+            self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
+            self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
+            self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
+            self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")
+
+    def init(self, dataset=False):
+        if not dataset:
+            self.error("no dataset")
+
+        print("**INIT** dataset {0}.".format(dataset))
+
+        self.rdb.delete("stat:{0}".format(dataset))
+        self.rdb.set("stat:{0}".format(dataset), 0)
+
+    def error(self, error):
+        print("!!ERROR!! {0}".format(error))
+        sys.exit()
+
+
+import_hash = import_hash()
+#import_hash.download(dataset="nsrl_android")
+import_hash.init(dataset="nsrl_android")
+import_hash.process(dataset="nsrl_android")
diff --git a/bin/import-poc/requirements b/bin/import-poc/requirements
new file mode 100644
index 0000000..a7d3a7b
--- /dev/null
+++ b/bin/import-poc/requirements
@@ -0,0 +1,3 @@
+pathlab
+wget
+redis
\ No newline at end of file
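For reference, the Redis layout written by this first patch can be queried directly: `l:<MD5>` maps an MD5 to its SHA-1, `h:<SHA-1>` holds the full NSRL record, and `stat:<key>` holds an import counter. Below is a minimal sketch of such a lookup, assuming a database already populated by this script on the host/port from config.json; the MD5 value is a hypothetical placeholder:

```
import redis

# Same connection settings the PoC reads from config.json
rdb = redis.Redis(host="127.0.0.1", port=6666, decode_responses=True)

# l:<MD5> maps an MD5 to its SHA-1; h:<SHA-1> holds the full NSRL record
md5 = "0123456789ABCDEF0123456789ABCDEF"  # hypothetical placeholder value
sha1 = rdb.get("l:{}".format(md5))
if sha1:
    print(rdb.hgetall("h:{}".format(sha1)))

# stat:<key> counts the records imported for the Android base dataset
print(rdb.get("stat:NSRLAndroid"))
```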
From ca910ff22f2aed70b811039b1ab396789316a374 Mon Sep 17 00:00:00 2001
From: Koen Van Impe
Date: Thu, 29 Jul 2021 10:46:59 +0200
Subject: [PATCH 2/2] Support for import of NSRL datasets in ISO and ZIP format

---
 bin/import-poc/README.md                   |  30 +++-
 bin/import-poc/config.json                 |  11 +-
 bin/import-poc/import-hashlookup-server.py | 156 +++++++++++++++++----
 3 files changed, 165 insertions(+), 32 deletions(-)

diff --git a/bin/import-poc/README.md b/bin/import-poc/README.md
index d3ee74c..d98e7ac 100644
--- a/bin/import-poc/README.md
+++ b/bin/import-poc/README.md
@@ -2,9 +2,33 @@
 
 PoC to better streamline the import of NSRL data.
 
-Todo:
+## Usage
+
+```
+$ python3 import-hashlookup-server.py -h
+usage: import-hashlookup-server.py [-h] [-l | -i IMPORT_DATASET | -e INIT_DATASET] [-d] [-c]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l, --list            List datasets available for download and import.
+  -i IMPORT_DATASET, --import-dataset IMPORT_DATASET
+                        Import a dataset.
+  -e INIT_DATASET, --init-dataset INIT_DATASET
+                        Remove / initialize a dataset.
+  -d, --skip-download   Skip downloading the dataset.
+  -c, --skip-init       Skip initialization of the database.
+```
+
+```
+$ python3 import-hashlookup-server.py -i nsrl_minimal
+```
+
+## Todo
+
 
-- Test with the other datasets (currently only Android was tested): fetch from ZIP instead of ISO file
+- ~~Test with the other datasets (currently only Android was tested): fetch from ZIP instead of ISO file~~
+- Move older import scripts to an "old" directory
+- Complete with sha256 and xcycl
 - Error handling (sufficient drive space, Redis active, check if there is already a db before init)
 - Multiple datasets at once?
-- Import from MISP (depends on filter)
+- Import from MISP (depends on filter)
\ No newline at end of file
diff --git a/bin/import-poc/config.json b/bin/import-poc/config.json
index 1af0762..1746734 100644
--- a/bin/import-poc/config.json
+++ b/bin/import-poc/config.json
@@ -4,15 +4,20 @@
     "nsrl_minimal": { "description": "(minimal) - contains the set of DISTINCT appearances of files in modern applications; no file entries are duplicated",
                       "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
     "nsrl_android": { "description": "Contains modern Android mobile applications",
-                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"}
+                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"},
+    "nsrl_unique": { "description": "Contains the set of file entries that appear ONLY ONCE in the entire NSRL collection; these are unique to each of the applications that are in the collection",
+                     "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernu.zip"},
+    "nsrl_ios": { "description": "iOS dataset",
+                  "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_ios.iso"}
   },
   "local_path": "/home/koenv/nsrl/",
   "import": {
     "max_value": 500000000,
-    "mod_lines": 1000
+    "mod_lines": 2500
   },
   "redis": {
     "hostname": "127.0.0.1",
-    "port": 6666
+    "port": 6666,
+    "flushdb_on_init": 1
   }
 }
\ No newline at end of file
diff --git a/bin/import-poc/import-hashlookup-server.py b/bin/import-poc/import-hashlookup-server.py
index 9a6b16e..67c4e72 100644
--- a/bin/import-poc/import-hashlookup-server.py
+++ b/bin/import-poc/import-hashlookup-server.py
@@ -5,6 +5,8 @@ import wget
 import sys
 import redis
 import json
+import time
+import argparse
 
 
 class import_hash:
@@ -17,10 +19,14 @@ class import_hash:
             self.local_path = data["local_path"]
             redis_host = data["redis"]["hostname"]
             redis_port = data["redis"]["port"]
+            self.flushrdb = data["redis"]["flushdb_on_init"]
 
         self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
 
     def download(self, dataset=False):
+        """ Download a dataset
+        :param dataset: The dataset to use; a key looked up in config.json to get the correct download URL
+        """
         if not dataset:
             self.error("no dataset")
 
@@ -28,12 +34,21 @@ class import_hash:
         wget.download(self.hash_datasets[dataset]["url"], self.local_path)
         print("\nDownload completed.")
 
-    def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
-
+    def __process_nsrl_support(self, isofile, dataset_file, key):
+        """ Process support NSRL data (OS, Product, Vendor/Manufacturer)
+        :param isofile: The accessor object for the ISO file. When set to False, the NSRL dataset is provided in ZIP format
+        :param dataset_file: The location of the dataset; either a path inside the ISO or a direct filepath
+        :param key: The type of support NSRL data (used in the Redis keys)
+        """
         print("\n Work on {0}".format(dataset_file))
 
+        if isofile:
+            with_element = isofile.IsoPath("/{0}".format(dataset_file)).open()
+        else:
+            with_element = open(dataset_file, encoding='utf-8')
+
         ln = 0
-        with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
+        with with_element as f:
             while True:
                 l = f.readline()
 
@@ -51,9 +66,10 @@
                         except:
                             continue
 
-                    self.rdb.sadd("s:{0}".format(key1), drecords[key1])
-                    self.rdb.hmset("h-{0}:{1}".format(key1, drecords[key1]), drecords)
-                    self.rdb.incrby("stat:{0}-import".format(key2))
+                    self.rdb.sadd("s:{0}".format(key), drecords[key])
+                    self.rdb.hmset("h-{0}:{1}".format(key, drecords[key]), drecords)
+                    stat_import_key = Path(dataset_file).stem  # filename without directory or .TXT/.txt extension
+                    self.rdb.incrby("stat:{0}-import".format(stat_import_key))
                     if ln % self.mod_lines == 0:
                         print("  Imported {0} records.".format(ln))
 
@@ -63,19 +79,32 @@
                 ln = ln + 1
         print(" Finished, imported {0} records.".format(ln))
 
-    def __process_nsrl_zip(self, isofile, dataset_file, key):
+    def __process_nsrl_base(self, isofile, dataset_file, rdbkey):
+        """ Process base NSRL data (file hashes)
+        :param isofile: The accessor object for the ISO file. When set to False, the NSRL dataset is provided in ZIP format
+        :param dataset_file: The location of the dataset; either a path inside the ISO or a direct filepath
+        :param rdbkey: Redis database key (corresponds with the dataset key in config.json)
+        """
         print("\n Work on {0}".format(dataset_file))
 
-        # First get the ZIP from the ISO and then extract the ZIP
-        zip_f = open(self.local_path + dataset_file, "wb")
-        with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
-            zip_f.write(f.read())
-        zip_f.close()
-        zip_f = zipfile.ZipFile(self.local_path + dataset_file)
-        zip_f.extractall(self.local_path)
-
+        if isofile:
+            # We received the NSRL dataset as an ISO file
+            # First get the ZIP from the ISO and then extract the ZIP
+            zip_f = open(self.local_path + dataset_file, "wb")
+            with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
+                zip_f.write(f.read())
+            zip_f.close()
+            zip_f = zipfile.ZipFile(self.local_path + dataset_file)
+            zip_f.extractall(self.local_path)
+
+            local_dataset_file = self.local_path + "NSRLFile.txt"
+        else:
+            # No additional actions needed
+            # We received the NSRL dataset as a ZIP file
+            local_dataset_file = dataset_file
+
         ln = 0
-        lines = open(self.local_path + "NSRLFile.txt", "r")
+        lines = open(local_dataset_file, "r")
 
         for l in lines:
             if ln == 0:
@@ -89,9 +118,15 @@
                     except:
                         continue
 
+                # Add some metadata
+                drecords['source'] = "NSRL"
+                drecords['db'] = rdbkey
+                drecords['insert-timestamp'] = time.time()
+
+                # Base records
                 self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
                 self.rdb.hmset("h:{}".format(drecords['SHA-1']), drecords)
-                self.rdb.incrby("stat:{0}".format(key))
+                self.rdb.incrby("stat:{0}".format(rdbkey))
                 if ln % self.mod_lines == 0:
                     print("  Imported {0} records.".format(ln))
 
@@ -101,6 +136,9 @@ class import_hash:
         print(" Finished, imported {0} records.".format(ln))
 
     def process(self, dataset=False):
+        """ Process a dataset
+        :param dataset: The dataset to process
+        """
         if not dataset:
             self.error("no dataset")
 
@@ -115,28 +153,94 @@ class import_hash:
         dataset_file_type = local_dataset[local_dataset.rfind(".")+1:]
 
         if dataset_file_type == "iso":
+            # We read directly from the ISO file
             isofile = pathlab.IsoAccessor(local_dataset)
 
-            self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
-            self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
-            self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
-            self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")
+            self.__process_nsrl_base(isofile, "NSRLFILE.ZIP", dataset)
+            self.__process_nsrl_support(isofile, "NSRLMFG.TXT", "MfgCode")
+            self.__process_nsrl_support(isofile, "NSRLOS.TXT", "OpSystemCode")
+            self.__process_nsrl_support(isofile, "NSRLPROD.TXT", "ProductCode")
+        elif dataset_file_type == "zip":
+            # Extract the ZIP
+            zip_f = zipfile.ZipFile(local_dataset)
+            zip_f.extractall(self.local_path)
+            # NSRL ZIPs store the datafiles in a subdirectory
+            namelist_first = zip_f.namelist()[0]
+            zip_extract_path = ""
+            if namelist_first[-1] == "/":
+                zip_extract_path = self.local_path + namelist_first
+            # Indicate we don't have an ISO object
+            isofile = False
+
+            self.__process_nsrl_base(isofile, zip_extract_path + "NSRLFile.txt", dataset)
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLMfg.txt", "MfgCode")
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLOS.txt", "OpSystemCode")
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLProd.txt", "ProductCode")
 
     def init(self, dataset=False):
+        """ Remove / initialize a dataset
+        :param dataset: Affected dataset
+        """
         if not dataset:
             self.error("no dataset")
 
         print("**INIT** dataset {0}.".format(dataset))
 
-        self.rdb.delete("stat:{0}".format(dataset))
-        self.rdb.set("stat:{0}".format(dataset), 0)
+        if self.flushrdb:
+            self.rdb.flushdb()
+        else:
+            self.rdb.delete("stat:{0}".format(dataset))
+            self.rdb.set("stat:{0}".format(dataset), 0)
+
+    def datasetlist(self):
+        """ List the available datasets
+        """
+        for nsrl in self.hash_datasets:
+            print("{0}\n {1}\n from: {2}\n".format(nsrl, self.hash_datasets[nsrl]["description"], self.hash_datasets[nsrl]["url"]))
+
+    def valid_dataset(self, dataset):
+        """ Verify that the dataset exists
+        :param dataset: Affected dataset
+        """
+        if dataset in self.hash_datasets:
+            return True
+        else:
+            return False
 
     def error(self, error):
+        """ Print an error message and exit
+        :param error: Error message
+        """
         print("!!ERROR!! {0}".format(error))
         sys.exit()
 
 
+parser = argparse.ArgumentParser()
+group = parser.add_mutually_exclusive_group()
+group.add_argument("-l", "--list", action="store_true", help="List datasets available for download and import.")
+group.add_argument("-i", "--import-dataset", help="Import a dataset.")
+group.add_argument("-e", "--init-dataset", help="Remove / initialize a dataset.")
+parser.add_argument("-d", "--skip-download", action="store_true", help="Skip downloading the dataset.")
+parser.add_argument("-c", "--skip-init", action="store_true", help="Skip initialization of the database.")
+args = parser.parse_args()
+
 import_hash = import_hash()
-#import_hash.download(dataset="nsrl_android")
-import_hash.init(dataset="nsrl_android")
-import_hash.process(dataset="nsrl_android")
+
+if args.list:
+    import_hash.datasetlist()
+elif args.import_dataset:
+    dataset = args.import_dataset
+    if import_hash.valid_dataset(dataset):
+        if not args.skip_download:
+            import_hash.download(dataset=dataset)
+        if not args.skip_init:
+            import_hash.init(dataset=dataset)
+        import_hash.process(dataset=dataset)
+    else:
+        print("Dataset not found.")
+elif args.init_dataset:
+    dataset = args.init_dataset
+    if import_hash.valid_dataset(dataset):
+        import_hash.init(dataset=dataset)
+    else:
+        print("Dataset not found.")
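With the second patch applied, base records also carry the added `source`, `db` and `insert-timestamp` fields, and the support tables written by `__process_nsrl_support` can be joined back onto a file record via the `h-<key>:<value>` hashes. Below is a minimal sketch of such a lookup; the SHA-1 is a hypothetical placeholder, and the field names assume the standard NSRL column headers (`FileName`, `OpSystemCode`/`OpSystemName`, `ProductCode`/`ProductName`):

```
import redis

rdb = redis.Redis(host="127.0.0.1", port=6666, decode_responses=True)

# Hypothetical placeholder; use a SHA-1 known to be in an imported dataset
sha1 = "0000000000000000000000000000000000000000"
record = rdb.hgetall("h:{}".format(sha1))

if record:
    # Resolve the support tables stored under h-<key>:<value>
    os_info = rdb.hgetall("h-OpSystemCode:{}".format(record.get("OpSystemCode", "")))
    product = rdb.hgetall("h-ProductCode:{}".format(record.get("ProductCode", "")))
    print(record.get("FileName"), record.get("source"), record.get("db"))
    print(os_info.get("OpSystemName"), product.get("ProductName"))
```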