Support for import of NSRL datasets in ISO and ZIP format

2024-12-22 08:45:58 +00:00 · 2021-07-29 10:46:59 +02:00 · 2021-07-29 10:46:59 +02:00 · ca910ff22f
commit ca910ff22f
parent d280216361
3 changed files with 165 additions and 32 deletions
--- a/bin/import-poc/README.md
+++ b/bin/import-poc/README.md
@@ -2,9 +2,33 @@

 PoC to better streamline the import of NSRL data.

-Todo:
+## Usage

-- Test with the other data sets (currently only Android was tested) : Fetch from ZIP and not ISO file
+```
+$ python3 import-hashlookup-server.py -h
+usage: import-hashlookup-server.py [-h] [-l | -i IMPORT_DATASET | -e INIT_DATASET] [-d] [-c]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l, --list            List datasets available for download and import.
+  -i IMPORT_DATASET, --import-dataset IMPORT_DATASET
+                        Import a dataset.
+  -e INIT_DATASET, --init-dataset INIT_DATASET
+                        Remove / initialize a dataset.
+  -d, --skip-download   Skip downloading the dataset.
+  -c, --skip-init       Skip initialization of the database.
+```
+
+```
+$ python3 import-hashlookup-server.py -i nsrl_minimal
+```
+
+## Todo
+
+
+- ~~Test with the other data sets (currently only Android was tested) : Fetch from ZIP and not ISO file~~
+- Move older input scripts to "old" directory 
+- Complete with sha256 and xcycl
 - Error handling (sufficient drive space, Redis active, check if there is already a db before init)
 - Multiple data sets at once?
-- Import from MISP (depends on filter)
+- Import from MISP (depends on filter)
--- a/bin/import-poc/config.json
+++ b/bin/import-poc/config.json
@@ -4,15 +4,20 @@
    "nsrl_minimal":      { "description": "(minimal) - contains the set of DISTINCT appearances of files in  modern applications; no file entries are duplicated",
                        "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
    "nsrl_android":      { "description": "Contains modern Android mobile applications",
-                        "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"}
+                        "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"},
+    "nsrl_unique":       { "description": "Contains the set of file entries that appear ONLY ONCE in the entire NSRL collection; these are unique to each of the applications that are in the collection",
+                        "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernu.zip"},
+    "nsrl_ios":          { "description": "IOS dataset",
+                        "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_ios.iso"}
    },
  "local_path": "/home/koenv/nsrl/",
  "import": {
      "max_value": 500000000,
-      "mod_lines": 1000
+      "mod_lines": 2500
  },
  "redis": {
      "hostname": "127.0.0.1",
-      "port": 6666
+      "port": 6666,
+      "flushdb_on_init": 1
  }
 }
--- a/bin/import-poc/import-hashlookup-server.py
+++ b/bin/import-poc/import-hashlookup-server.py
@@ -5,6 +5,8 @@ import wget
 import sys
 import redis
 import json
+import time
+import argparse


 class import_hash:
@@ -17,10 +19,14 @@ class import_hash:
            self.local_path = data["local_path"]
            redis_host = data["redis"]["hostname"]
            redis_port = data["redis"]["port"]
+            self.flushrdb = data["redis"]["flushdb_on_init"]

        self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)

    def download(self, dataset=False):
+        """ Download a dataset
+        :param dataset: The dataset to use. This is a key looked for in the config.json file to get the correct download URL
+        """
        if not dataset:
            self.error("no dataset")

@@ -28,12 +34,21 @@ class import_hash:
        wget.download(self.hash_datasets[dataset]["url"], self.local_path)
        print("\nDownload completed.")

-    def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
-
+    def __process_nsrl_support(self, isofile, dataset_file, key):
+        """ Process support NSRL data (OS, Product, Vendor/Manufacturer)
+        :param isofile: The ISO file object. When set to False, this indicates the NSRL dataset is provided in ZIP format
+        :param dataset_file: The location of the dataset: either a path inside the ISO or a direct file path
+        :param key: type of support NSRL data
+        """
        print("\n Work on {0}".format(dataset_file))

+        if isofile:
+            with_element = isofile.IsoPath("/{0}".format(dataset_file)).open()
+        else:
+            with_element = open(dataset_file, encoding='utf-8')
+
        ln = 0
-        with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
+        with with_element as f:
            while True:
                l = f.readline()

@@ -51,9 +66,10 @@ class import_hash:
                        except:
                            continue

-                    self.rdb.sadd("s:{0}".format(key1), drecords[key1])
-                    self.rdb.hmset("h-{0}:{1}".format(key1, drecords[key1]), drecords)
-                    self.rdb.incrby("stat:{0}-import".format(key2))
+                    self.rdb.sadd("s:{0}".format(key), drecords[key])
+                    self.rdb.hmset("h-{0}:{1}".format(key, drecords[key]), drecords)
+                    stat_import_key = dataset_file[dataset_file.rfind("/")+1:dataset_file.rfind(".txt")]
+                    self.rdb.incrby("stat:{0}-import".format(stat_import_key))
                    if ln % self.mod_lines == 0:
                        print("  Imported {0} records.".format(ln))

@@ -63,19 +79,32 @@ class import_hash:
                ln = ln + 1
            print("  Finished, importing {0} records.".format(ln))

-    def __process_nsrl_zip(self, isofile, dataset_file, key):
+    def __process_nsrl_base(self, isofile, dataset_file, rdbkey):
+        """ Process base NSRL data (file hashes)
+        :param isofile: The ISO file object. When set to False, this indicates the NSRL dataset is provided in ZIP format
+        :param dataset_file: The location of the dataset: either a path inside the ISO or a direct file path
+        :param rdbkey: redis database key (corresponds with key of dataset in config.json)
+        """
        print("\n Work on {0}".format(dataset_file))

-        # First get the ZIP from the ISO and then extract the ZIP
-        zip_f = open(self.local_path + dataset_file, "wb")
-        with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
-            zip_f.write((f.read()))
-        zip_f.close()
-        zip_f = zipfile.ZipFile(self.local_path + dataset_file)
-        zip_f.extractall(self.local_path)
-        
+        if isofile:
+            # We received the NSRL dataset as an ISO file
+            # First get the ZIP from the ISO and then extract the ZIP
+            zip_f = open(self.local_path + dataset_file, "wb")
+            with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
+                zip_f.write((f.read()))
+            zip_f.close()
+            zip_f = zipfile.ZipFile(self.local_path + dataset_file)
+            zip_f.extractall(self.local_path)
+
+            local_dataset_file = self.local_path + "NSRLFile.txt"
+        else:
+            # No need to do additional actions
+            # We probably received the NSRL as a ZIP file
+            local_dataset_file = dataset_file
+
        ln = 0
-        lines = open(self.local_path + "NSRLFile.txt", "r")
+        lines = open(local_dataset_file, "r")

        for l in lines:
            if ln == 0:
@@ -89,9 +118,15 @@ class import_hash:
                    except:
                        continue

+                # Add some meta data
+                drecords['source'] = "NSRL"
+                drecords['db'] = rdbkey
+                drecords['insert-timestamp'] = time.time()
+
+                # Base records
                self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
                self.rdb.hmset("h:{}".format(drecords['SHA-1']), drecords)
-                self.rdb.incrby("stat:{0}".format(key))
+                self.rdb.incrby("stat:{0}".format(rdbkey))
                if ln % self.mod_lines == 0:
                    print("  Imported {0} records.".format(ln))

@@ -101,6 +136,9 @@ class import_hash:
        print("  Finished, importing {0} records.".format(ln))

    def process(self, dataset=False):
+        """Process a dataset
+        :param dataset: The dataset to process
+        """
        if not dataset:
            self.error("no dataset")

@@ -115,28 +153,94 @@ class import_hash:
        dataset_file_type = local_dataset[local_dataset.rfind(".")+1:]

        if dataset_file_type == "iso":
+            # We read directly from the ISO file
            isofile = pathlab.IsoAccessor(local_dataset)

-            self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
-            self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
-            self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
-            self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")
+            self.__process_nsrl_base(isofile, "NSRLFILE.ZIP", dataset)
+            self.__process_nsrl_support(isofile, "NSRLMFG.TXT", "MfgCode")
+            self.__process_nsrl_support(isofile, "NSRLOS.TXT", "OpSystemCode")
+            self.__process_nsrl_support(isofile, "NSRLPROD.TXT", "ProductCode")
+        elif dataset_file_type == "zip":
+            # Extract the ZIP
+            zip_f = zipfile.ZipFile(local_dataset)
+            zip_f.extractall(self.local_path)
+            # NSRL ZIPs store the datafiles in a subdirectory
+            namelist_first = zip_f.namelist()[0]
+            zip_extract_path = ""
+            if namelist_first[-1] == "/":
+                zip_extract_path = self.local_path + namelist_first
+            # Indicate we don't have an ISO object
+            isofile = False
+
+            self.__process_nsrl_base(isofile, zip_extract_path + "NSRLFile.txt", dataset)
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLMfg.txt", "MfgCode")
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLOS.txt", "OpSystemCode")
+            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLProd.txt", "ProductCode")

    def init(self, dataset=False):
+        """ Remove / Initialize a dataset
+        :param dataset: Affected dataset
+        """
        if not dataset:
            self.error("no dataset")

        print("**INIT** dataset {0} .".format(dataset))

-        self.rdb.delete("stat:{0}".format(dataset))
-        self.rdb.set("stat:{0}".format(dataset), 0)
+        if self.flushrdb:
+            self.rdb.flushdb()
+        else:
+            self.rdb.delete("stat:{0}".format(dataset))
+            self.rdb.set("stat:{0}".format(dataset), 0)
+
+    def datasetlist(self):
+        """ List the available datasets
+        """
+        for nsrl in self.hash_datasets:
+            print("{0}\n {1}\n from: {2}\n".format(nsrl, self.hash_datasets[nsrl]["description"], self.hash_datasets[nsrl]["url"]))
+
+    def valid_dataset(self, dataset):
+        """ Verify that the dataset exists
+        :param dataset: Affected dataset
+        """
+        if dataset in self.hash_datasets:
+            return True
+        else:
+            return False

    def error(self, error):
+        """ Print an error message and exit
+        :param error: Error message
+        """
        print("!!ERROR!! {0}".format(error))
        sys.exit()


+parser = argparse.ArgumentParser()
+group = parser.add_mutually_exclusive_group()
+group.add_argument("-l", "--list", action="store_true", help="List datasets available for download and import.")
+group.add_argument("-i", "--import-dataset", help="Import a dataset.")
+group.add_argument("-e", "--init-dataset", help="Remove / initialize a dataset.")
+parser.add_argument("-d", "--skip-download", action="store_true", help="Skip downloading the dataset.")
+parser.add_argument("-c", "--skip-init", action="store_true", help="Skip initialization of the database.")
+args = parser.parse_args()
+
 import_hash = import_hash()
-#import_hash.download(dataset="nsrl_android")
-import_hash.init(dataset="nsrl_android")
-import_hash.process(dataset="nsrl_android")
+
+if args.list:
+    import_hash.datasetlist()
+elif args.import_dataset:
+    dataset = args.import_dataset
+    if import_hash.valid_dataset(dataset):
+        if not args.skip_download:
+            import_hash.download(dataset=dataset)
+        if not args.skip_init:
+            import_hash.init(dataset=dataset)
+        import_hash.process(dataset=dataset)
+    else:
+        print("Dataset not found.")
+elif args.init_dataset:
+    dataset = args.init_dataset
+    if import_hash.valid_dataset(dataset):
+        import_hash.init(dataset=dataset)
+    else:
+        print("Dataset not found.")