From ca910ff22f2aed70b811039b1ab396789316a374 Mon Sep 17 00:00:00 2001
From: Koen Van Impe
Date: Thu, 29 Jul 2021 10:46:59 +0200
Subject: [PATCH] Support for import of NSRL datasets in ISO and ZIP format

---
 bin/import-poc/README.md                   |  30 +++-
 bin/import-poc/config.json                 |  11 +-
 bin/import-poc/import-hashlookup-server.py | 156 +++++++++++++++++----
 3 files changed, 165 insertions(+), 32 deletions(-)

diff --git a/bin/import-poc/README.md b/bin/import-poc/README.md
index d3ee74c..d98e7ac 100644
--- a/bin/import-poc/README.md
+++ b/bin/import-poc/README.md
@@ -2,9 +2,33 @@
 
 PoC to better streamline the import of NSRL data.
 
-Todo:
+## Usage
 
-- Test with the other data sets (currently only Android was tested) : Fetch from ZIP and not ISO file
+```
+$ python3 import-hashlookup-server.py -h
+usage: import-hashlookup-server.py [-h] [-l | -i IMPORT_DATASET | -e INIT_DATASET] [-d] [-c]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l, --list            List datasets available for download and import.
+  -i IMPORT_DATASET, --import-dataset IMPORT_DATASET
+                        Import a dataset.
+  -e INIT_DATASET, --init-dataset INIT_DATASET
+                        Remove / initialize a dataset.
+  -d, --skip-download   Skip downloading the dataset.
+  -c, --skip-init       Skip initialization of the database.
+```
+
+```
+$ python3 import-hashlookup-server.py -i nsrl_minimal
+```
+
+## Todo
+
+
+- ~~Test with the other data sets (currently only Android was tested) : Fetch from ZIP and not ISO file~~
+- Move older import scripts to the "old" directory
+- Complete with sha256 and xcycl
 - Error handling (sufficient drive space, Redis active, check if there is already a db before init)
 - Multiple data sets at once?
-- Import from MISP (depends on filter)
+- Import from MISP (depends on filter)
\ No newline at end of file
diff --git a/bin/import-poc/config.json b/bin/import-poc/config.json
index 1af0762..1746734 100644
--- a/bin/import-poc/config.json
+++ b/bin/import-poc/config.json
@@ -4,15 +4,20 @@
     "nsrl_minimal": { "description": "(minimal) - contains the set of DISTINCT appearances of files in modern applications; no file entries are duplicated",
                       "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
     "nsrl_android": { "description": "Contains modern Android mobile applications",
-                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"}
+                      "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"},
+    "nsrl_unique": { "description": "Contains the set of file entries that appear ONLY ONCE in the entire NSRL collection; these are unique to each of the applications in the collection",
+                     "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernu.zip"},
+    "nsrl_ios": { "description": "iOS dataset",
+                  "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_ios.iso"}
   },
   "local_path": "/home/koenv/nsrl/",
   "import": {
     "max_value": 500000000,
-    "mod_lines": 1000
+    "mod_lines": 2500
   },
   "redis": {
     "hostname": "127.0.0.1",
-    "port": 6666
+    "port": 6666,
+    "flushdb_on_init": 1
   }
 }
\ No newline at end of file
diff --git a/bin/import-poc/import-hashlookup-server.py b/bin/import-poc/import-hashlookup-server.py
index 9a6b16e..67c4e72 100644
--- a/bin/import-poc/import-hashlookup-server.py
+++ b/bin/import-poc/import-hashlookup-server.py
@@ -5,6 +5,8 @@ import wget
 import sys
 import redis
 import json
+import time
+import argparse
 
 
 class import_hash:
@@ -17,10 +19,14 @@ class import_hash:
         self.local_path = data["local_path"]
         redis_host = data["redis"]["hostname"]
         redis_port = data["redis"]["port"]
+        self.flushrdb = data["redis"]["flushdb_on_init"]
         self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
 
     def download(self, dataset=False):
+        """ Download a dataset
+        :param dataset: The dataset to use; a key looked up in config.json to get the correct download URL
+        """
         if not dataset:
             self.error("no dataset")
 
@@ -28,12 +34,21 @@ class import_hash:
         wget.download(self.hash_datasets[dataset]["url"], self.local_path)
         print("\nDownload completed.")
 
-    def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
-
+    def __process_nsrl_support(self, isofile, dataset_file, key):
+        """ Process supporting NSRL data (OS, Product, Vendor/Manufacturer)
+        :param isofile: The ISO file object; when set to False, the NSRL dataset was provided in ZIP format
+        :param dataset_file: The location of the dataset; either a path inside the ISO or a direct file path
+        :param key: The type of supporting NSRL data
+        """
         print("\n Work on {0}".format(dataset_file))
 
+        if isofile:
+            with_element = isofile.IsoPath("/{0}".format(dataset_file)).open()
+        else:
+            with_element = open(dataset_file, encoding='utf-8')
+
         ln = 0
-        with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
+        with with_element as f:
             while True:
                 l = f.readline()
@@ -51,9 +66,10 @@
                 except:
                     continue
 
-                self.rdb.sadd("s:{0}".format(key1), drecords[key1])
-                self.rdb.hmset("h-{0}:{1}".format(key1, drecords[key1]), drecords)
-                self.rdb.incrby("stat:{0}-import".format(key2))
+                self.rdb.sadd("s:{0}".format(key), drecords[key])
+                self.rdb.hmset("h-{0}:{1}".format(key, drecords[key]), drecords)
+                stat_import_key = dataset_file[dataset_file.rfind("/")+1:dataset_file.rfind(".txt")]
+                self.rdb.incrby("stat:{0}-import".format(stat_import_key))
 
                 if ln % self.mod_lines == 0:
                     print(" Imported {0} records.".format(ln))
@@ -63,19 +79,32 @@
                 ln = ln + 1
         print(" Finished, importing {0} records.".format(ln))
 
-    def __process_nsrl_zip(self, isofile, dataset_file, key):
+    def __process_nsrl_base(self, isofile, dataset_file, rdbkey):
+        """ Process base NSRL data (file hashes)
+        :param isofile: The ISO file object; when set to False, the NSRL dataset was provided in ZIP format
+        :param dataset_file: The location of the dataset; either a path inside the ISO or a direct file path
+        :param rdbkey: The Redis database key (corresponds to the key of the dataset in config.json)
+        """
         print("\n Work on {0}".format(dataset_file))
 
-        # First get the ZIP from the ISO and then extract the ZIP
-        zip_f = open(self.local_path + dataset_file, "wb")
-        with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
-            zip_f.write((f.read()))
-        zip_f.close()
-        zip_f = zipfile.ZipFile(self.local_path + dataset_file)
-        zip_f.extractall(self.local_path)
-
+        if isofile:
+            # We received the NSRL dataset as an ISO file
+            # First get the ZIP from the ISO and then extract the ZIP
+            zip_f = open(self.local_path + dataset_file, "wb")
+            with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
+                zip_f.write(f.read())
+            zip_f.close()
+            zip_f = zipfile.ZipFile(self.local_path + dataset_file)
+            zip_f.extractall(self.local_path)
+
+            local_dataset_file = self.local_path + "NSRLFile.txt"
+        else:
+            # No additional actions needed;
+            # we probably received the NSRL dataset as a ZIP file
+            local_dataset_file = dataset_file
+
         ln = 0
-        lines = open(self.local_path + "NSRLFile.txt", "r")
+        lines = open(local_dataset_file, "r")
 
         for l in lines:
             if ln == 0:
@@ -89,9 +118,15 @@
             except:
                 continue
 
+            # Add some metadata
+            drecords['source'] = "NSRL"
+            drecords['db'] = rdbkey
+            drecords['insert-timestamp'] = time.time()
+
+            # Base records
             self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
             self.rdb.hmset("h:{}".format(drecords['SHA-1']), drecords)
-            self.rdb.incrby("stat:{0}".format(key))
+            self.rdb.incrby("stat:{0}".format(rdbkey))
 
             if ln % self.mod_lines == 0:
                 print(" Imported {0} records.".format(ln))
@@ -101,6 +136,9 @@
         print(" Finished, importing {0} records.".format(ln))
 
     def process(self, dataset=False):
+        """ Process a dataset
+        :param dataset: The dataset to process
+        """
         if not dataset:
             self.error("no dataset")
 
@@ -115,28 +153,94 @@
         dataset_file_type = local_dataset[local_dataset.rfind(".")+1:]
 
         if dataset_file_type == "iso":
+            # We read directly from the ISO file
             isofile = pathlab.IsoAccessor(local_dataset)
 
-            self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
-            self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
-            self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
-            self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")
+            self.__process_nsrl_base(isofile, "NSRLFILE.ZIP", dataset)
+            self.__process_nsrl_support(isofile, "NSRLMFG.TXT", "MfgCode")
+            self.__process_nsrl_support(isofile, "NSRLOS.TXT", "OpSystemCode")
+            self.__process_nsrl_support(isofile, "NSRLPROD.TXT", "ProductCode")
+        elif dataset_file_type == "zip":
+            # Extract the ZIP
+            zip_f = zipfile.ZipFile(local_dataset)
+            zip_f.extractall(self.local_path)
+
+            # NSRL ZIPs store the data files in a subdirectory
+            namelist_first = zip_f.namelist()[0]
+            zip_extract_path = ""
+            if namelist_first[-1] == "/":
+                zip_extract_path = self.local_path + namelist_first
+
+            # Indicate we don't have an ISO object
+            isofile = False
+
+            self.__process_nsrl_base(isofile, zip_extract_path + "NSRLFile.txt", dataset)
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLMfg.txt", "MfgCode")
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLOS.txt", "OpSystemCode")
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLProd.txt", "ProductCode")
 
     def init(self, dataset=False):
+        """ Remove / initialize a dataset
+        :param dataset: Affected dataset
+        """
         if not dataset:
             self.error("no dataset")
 
         print("**INIT** dataset {0} .".format(dataset))
-        self.rdb.delete("stat:{0}".format(dataset))
-        self.rdb.set("stat:{0}".format(dataset), 0)
+        if self.flushrdb:
+            self.rdb.flushdb()
+        else:
+            self.rdb.delete("stat:{0}".format(dataset))
+            self.rdb.set("stat:{0}".format(dataset), 0)
+
+    def datasetlist(self):
+        """ List the available datasets
+        """
+        for nsrl in self.hash_datasets:
+            print("{0}\n {1}\n from: {2}\n".format(nsrl, self.hash_datasets[nsrl]["description"], self.hash_datasets[nsrl]["url"]))
+
+    def valid_dataset(self, dataset):
+        """ Verify that the dataset exists
+        :param dataset: Affected dataset
+        """
+        if dataset in self.hash_datasets:
+            return True
+        else:
+            return False
 
     def error(self, error):
+        """ Print an error message and exit
+        :param error: Error message
+        """
         print("!!ERROR!! {0}".format(error))
         sys.exit()
 
 
+parser = argparse.ArgumentParser()
+group = parser.add_mutually_exclusive_group()
+group.add_argument("-l", "--list", action="store_true", help="List datasets available for download and import.")
+group.add_argument("-i", "--import-dataset", help="Import a dataset.")
+group.add_argument("-e", "--init-dataset", help="Remove / initialize a dataset.")
+parser.add_argument("-d", "--skip-download", action="store_true", help="Skip downloading the dataset.")
+parser.add_argument("-c", "--skip-init", action="store_true", help="Skip initialization of the database.")
+args = parser.parse_args()
+
 import_hash = import_hash()
-#import_hash.download(dataset="nsrl_android")
-import_hash.init(dataset="nsrl_android")
-import_hash.process(dataset="nsrl_android")
+
+if args.list:
+    import_hash.datasetlist()
+elif args.import_dataset:
+    dataset = args.import_dataset
+    if import_hash.valid_dataset(dataset):
+        if not args.skip_download:
+            import_hash.download(dataset=dataset)
+        if not args.skip_init:
+            import_hash.init(dataset=dataset)
+        import_hash.process(dataset=dataset)
+    else:
+        print("Dataset not found.")
+elif args.init_dataset:
+    dataset = args.init_dataset
+    if import_hash.valid_dataset(dataset):
+        import_hash.init(dataset=dataset)
+    else:
+        print("Dataset not found.")
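
The patch writes base records to Redis as `l:<MD5>` (a string holding the SHA-1) and `h:<SHA-1>` (a hash holding the full NSRL record plus the added `source`, `db`, and `insert-timestamp` fields); supporting data lands in `h-<key>:<value>` hashes. A minimal lookup sketch against that layout for verifying an import, assuming the Redis instance from config.json (127.0.0.1:6666); the MD5 value below is a placeholder, not a real NSRL entry:

```
import redis

rdb = redis.Redis(host="127.0.0.1", port=6666, decode_responses=True)

md5 = "0" * 32  # placeholder MD5 of the file to look up
sha1 = rdb.get("l:{}".format(md5))  # base record: MD5 -> SHA-1
if sha1:
    record = rdb.hgetall("h:{}".format(sha1))  # full NSRL record + added metadata
    # Field names follow the NSRLFile.txt header row (e.g. FileName, ProductCode)
    print(record.get("FileName"), record.get("source"), record.get("db"))
    # Supporting data written by __process_nsrl_support()
    product = rdb.hgetall("h-ProductCode:{}".format(record.get("ProductCode")))
    print(product)
else:
    print("Hash not found.")
```

The same pattern works for the other supporting keys (`h-MfgCode:<code>`, `h-OpSystemCode:<code>`), and the `stat:*` counters give a quick sanity check on import volume.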
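The Todo list still carries "Error handling (sufficient drive space, Redis active)" as open. A possible pre-flight sketch, not part of this patch, assuming the same config.json layout; the 10 GB threshold is an arbitrary placeholder:

```
import json
import shutil
import sys

import redis

with open("config.json") as f:
    data = json.load(f)

# Check that Redis is reachable before starting a multi-GB import
rdb = redis.Redis(host=data["redis"]["hostname"], port=data["redis"]["port"])
try:
    rdb.ping()
except redis.exceptions.ConnectionError:
    sys.exit("!!ERROR!! Redis is not reachable.")

# Check free space in the download/extraction directory
free = shutil.disk_usage(data["local_path"]).free
min_free = 10 * 1024 ** 3  # arbitrary placeholder threshold (10 GB)
if free < min_free:
    sys.exit("!!ERROR!! Not enough free disk space in {0}.".format(data["local_path"]))
```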