Support for import of NSRL datasets in ISO and ZIP format

2024-11-22 10:07:11 +00:00 · 2021-07-29 10:46:59 +02:00 · 2021-07-29 10:46:59 +02:00 · ca910ff22f
commit ca910ff22f
parent d280216361
3 changed files with 165 additions and 32 deletions
--- a/bin/import-poc/README.md
+++ b/bin/import-poc/README.md
@ -2,9 +2,33 @@
 PoC to better streamline the import of NSRL data.
-Todo:
+## Usage
- Test with the other data sets (currently only Android was tested) : Fetch from ZIP and not ISO file
+```
 $ python3 import-hashlookup-server.py -h
 usage: import-hashlookup-server.py [-h] [-l | -i IMPORT_DATASET | -e INIT_DATASET] [-d] [-c]
 optional arguments:
  -h, --help            show this help message and exit
  -l, --list            List datasets available for download and import.
  -i IMPORT_DATASET, --import-dataset IMPORT_DATASET
                        Import a dataset.
  -e INIT_DATASET, --init-dataset INIT_DATASET
                        Remove / initialize a dataset.
  -d, --skip-download   Skip downloading the dataset.
  -c, --skip-init       Skip initialization of the database.
 ```
 ```
 $ python3 import-hashlookup-server.py -i nsrl_minimal
 ```
 ## Todo
 - ~~Test with the other data sets (currently only Android was tested) : Fetch from ZIP and not ISO file~~
 - Move older input scripts to "old" directory 
 - Complete with sha256 and xcycl
 - Error handling (sufficient drive space, Redis active, check if there is already a db before init)
 - Multiple data sets at once?
 - Import from MISP (depends on filter)
--- a/bin/import-poc/config.json
+++ b/bin/import-poc/config.json
@ -4,15 +4,20 @@
    "nsrl_minimal":      { "description": "(minimal) - contains the set of DISTINCT appearances of files in  modern applications; no file entries are duplicated",
                        "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
    "nsrl_android":      { "description": "Contains modern Android mobile applications",
-                        "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"}
+                        "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"},
    "nsrl_unique":       { "description": "Contains the set of file entries that appear ONLY ONCE  in the entire NSRL collection; these are unique to each off the applications that are in the collection",
                        "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernu.zip"},
    "nsrl_ios":          { "description": "IOS dataset",
                        "url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_ios.iso"}
    },
  "local_path": "/home/koenv/nsrl/",
  "import": {
      "max_value": 500000000,
-      "mod_lines": 1000
+      "mod_lines": 2500
  },
  "redis": {
      "hostname": "127.0.0.1",
-      "port": 6666
+      "port": 6666,
      "flushdb_on_init": 1
  }
 }
--- a/bin/import-poc/import-hashlookup-server.py
+++ b/bin/import-poc/import-hashlookup-server.py
@ -5,6 +5,8 @@ import wget
 import sys
 import redis
 import json
 import time
 import argparse
 class import_hash:
@ -17,10 +19,14 @@ class import_hash:
            self.local_path = data["local_path"]
            redis_host = data["redis"]["hostname"]
            redis_port = data["redis"]["port"]
            self.flushrdb = data["redis"]["flushdb_on_init"]
        self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
    def download(self, dataset=False):
        """ Download a dataset
        :param dataset: The dataset to use. This is a key looked for in the config.json file to get the correct download URL
        """
        if not dataset:
            self.error("no dataset")
@ -28,12 +34,21 @@ class import_hash:
        wget.download(self.hash_datasets[dataset]["url"], self.local_path)
        print("\nDownload completed.")
-    def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
+    def __process_nsrl_support(self, isofile, dataset_file, key):
-
+        """ Process support NSRL data (OS, Product, Vendor/Manufacturer)
        :param isofile: The object to the ISO file. When set to False this indicates the NSRL is provided in a ZIP format
        :param dataset_file: The location of the dataset, is either a path in the ISO or a direct filepath
        :param key: type of support NSRL data
        """
        print("\n Work on {0}".format(dataset_file))
        if isofile:
            with_element = isofile.IsoPath("/{0}".format(dataset_file)).open()
        else:
            with_element = open(dataset_file, encoding='utf-8')
        ln = 0
-        with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
+        with with_element as f:
            while True:
                l = f.readline()
@ -51,9 +66,10 @@ class import_hash:
                        except:
                            continue
-                    self.rdb.sadd("s:{0}".format(key1), drecords[key1])
+                    self.rdb.sadd("s:{0}".format(key), drecords[key])
-                    self.rdb.hmset("h-{0}:{1}".format(key1, drecords[key1]), drecords)
+                    self.rdb.hmset("h-{0}:{1}".format(key, drecords[key]), drecords)
-                    self.rdb.incrby("stat:{0}-import".format(key2))
+                    stat_import_key = dataset_file[dataset_file.rfind("/")+1:dataset_file.rfind(".txt")]
                    self.rdb.incrby("stat:{0}-import".format(stat_import_key))
                    if ln % self.mod_lines == 0:
                        print("  Imported {0} records.".format(ln))
@ -63,9 +79,16 @@ class import_hash:
                ln = ln + 1
            print("  Finished, importing {0} records.".format(ln))
-    def __process_nsrl_zip(self, isofile, dataset_file, key):
+    def __process_nsrl_base(self, isofile, dataset_file, rdbkey):
        """ Process base NSRL data (file hashes)
        :param isofile: The object to the ISO file. When set to False this indicates the NSRL is provided in a ZIP format
        :param dataset_file: The location of the dataset, is either a path in the ISO or a direct filepath
        :param rdbkey: redis database key (corresponds with key of dataset in config.json)
        """
        print("\n Work on {0}".format(dataset_file))
        if isofile:
            # We received the NSRL dataset as an ISO file
            # First get the ZIP from the ISO and then extract the ZIP
            zip_f = open(self.local_path + dataset_file, "wb")
            with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
@ -74,8 +97,14 @@ class import_hash:
            zip_f = zipfile.ZipFile(self.local_path + dataset_file)
            zip_f.extractall(self.local_path)
            local_dataset_file = self.local_path + "NSRLFile.txt"
        else:
            # No need to do additional actions
            # We probably received the NSRL as a ZIP file
            local_dataset_file = dataset_file
        ln = 0
-        lines = open(self.local_path + "NSRLFile.txt", "r")
+        lines = open(local_dataset_file, "r")
        for l in lines:
            if ln == 0:
@ -89,9 +118,15 @@ class import_hash:
                    except:
                        continue
                # Add some meta data
                drecords['source'] = "NSRL"
                drecords['db'] = rdbkey
                drecords['insert-timestamp'] = time.time()
                # Base records
                self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
                self.rdb.hmset("h:{}".format(drecords['SHA-1']), drecords)
-                self.rdb.incrby("stat:{0}".format(key))
+                self.rdb.incrby("stat:{0}".format(rdbkey))
                if ln % self.mod_lines == 0:
                    print("  Imported {0} records.".format(ln))
@ -101,6 +136,9 @@ class import_hash:
        print("  Finished, importing {0} records.".format(ln))
    def process(self, dataset=False):
        """Process a dataset
        :param dataset: The dataset to process
        """
        if not dataset:
            self.error("no dataset")
@ -115,28 +153,94 @@ class import_hash:
        dataset_file_type = local_dataset[local_dataset.rfind(".")+1:]
        if dataset_file_type == "iso":
            # We read directly from the ISO file
            isofile = pathlab.IsoAccessor(local_dataset)
-            self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
+            self.__process_nsrl_base(isofile, "NSRLFILE.ZIP", dataset)
-            self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
+            self.__process_nsrl_support(isofile, "NSRLMFG.TXT", "MfgCode")
-            self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
+            self.__process_nsrl_support(isofile, "NSRLOS.TXT", "OpSystemCode")
-            self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")
+            self.__process_nsrl_support(isofile, "NSRLPROD.TXT", "ProductCode")
        elif dataset_file_type == "zip":
            # Extract the ZIP
            zip_f = zipfile.ZipFile(local_dataset)
            zip_f.extractall(self.local_path)
            # NSRL ZIPs store the datafiles in a subdirectory
            namelist_first = zip_f.namelist()[0]
            zip_extract_path = ""
            if namelist_first[-1] == "/":
                zip_extract_path = self.local_path + namelist_first
            # Indicate we don't have an ISO object
            isofile = False
            self.__process_nsrl_base(isofile, zip_extract_path + "NSRLFile.txt", dataset)
            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLMfg.txt", "MfgCode")
            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLOS.txt", "OpSystemCode")
            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLProd.txt", "ProductCode")
    def init(self, dataset=False):
        """ Remove / Initialize a dataset
        :param dataset: Affected dataset
        """
        if not dataset:
            self.error("no dataset")
        print("**INIT** dataset {0} .".format(dataset))
        if self.flushrdb:
            self.rdb.flushdb()
        else:
            self.rdb.delete("stat:{0}".format(dataset))
            self.rdb.set("stat:{0}".format(dataset), 0)
    def datasetlist(self):
        """ List the available datasets
        """
        for nsrl in self.hash_datasets:
            print("{0}\n {1}\n from: {2}\n".format(nsrl, self.hash_datasets[nsrl]["description"], self.hash_datasets[nsrl]["url"]))
    def valid_dataset(self, dataset):
        """ Verify if the datset exist
        :param dataset: Affected dataset
        """
        if dataset in self.hash_datasets:
            return True
        else:
            return False
    def error(self, error):
        """ Print an error message and terminate the program

        The process exits with status 1 so that a shell or calling script can
        detect the failure; sys.exit() without an argument reports success
        (status 0).

        :param error: Error message
        :raises SystemExit: always, with exit code 1
        """
        print("!!ERROR!! {0}".format(error))
        sys.exit(1)
 parser = argparse.ArgumentParser()
 group = parser.add_mutually_exclusive_group()
 group.add_argument("-l", "--list", action="store_true", help="List datasets available for download and import.")
 group.add_argument("-i", "--import-dataset", help="Import a dataset.")
 group.add_argument("-e", "--init-dataset", help="Remove / initialize a dataset.")
 parser.add_argument("-d", "--skip-download", action="store_true", help="Skip downloading the dataset.")
 parser.add_argument("-c", "--skip-init", action="store_true", help="Skip initialization of the database.")
 args = parser.parse_args()
 import_hash = import_hash()
-#import_hash.download(dataset="nsrl_android")
+
-import_hash.init(dataset="nsrl_android")
+if args.list:
-import_hash.process(dataset="nsrl_android")
+    import_hash.datasetlist()
 elif args.import_dataset:
    dataset = args.import_dataset
    if import_hash.valid_dataset(dataset):
        if not args.skip_download:
            import_hash.download(dataset=dataset)
        if not args.skip_init:
            import_hash.init(dataset=dataset)
        import_hash.process(dataset=dataset)
    else:
        print("Dataset not found.")
 elif args.init_dataset:
    dataset = args.init_dataset
    if import_hash.valid_dataset(dataset):
        import_hash.init(dataset=dataset)
    else:
        print("Dataset not found.")