# hashlookup-server/bin/import-poc/import-hashlookup-server.py


from pathlib import Path
import pathlab
import zipfile
import wget
import sys
import redis
import json
import time
import argparse

class import_hash:

    def __init__(self):
        """ Read config.json and open the Redis connection """
        with open('config.json') as config_file:
            data = json.load(config_file)
        self.hash_datasets = data["nsrl_downloads"]
        self.max_value = data["import"]["max_value"]
        self.mod_lines = data["import"]["mod_lines"]
        self.local_path = data["local_path"]
        redis_host = data["redis"]["hostname"]
        redis_port = data["redis"]["port"]
        self.flushrdb = data["redis"]["flushdb_on_init"]
        self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
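
    # A minimal, illustrative config.json matching the keys read above; the
    # dataset name and URL below are assumptions, not shipped defaults:
    #
    # {
    #   "nsrl_downloads": {
    #     "nsrl_modern": {
    #       "description": "NSRL modern RDS",
    #       "url": "https://example.org/rds_modernm.zip"
    #     }
    #   },
    #   "import": {"max_value": 100000000, "mod_lines": 10000},
    #   "local_path": "/tmp/",
    #   "redis": {"hostname": "127.0.0.1", "port": 6379, "flushdb_on_init": false}
    # }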

    def download(self, dataset=False):
        """ Download a dataset

        :param dataset: The dataset to download. This is a key looked up in config.json to get the download URL
        """
        if not dataset:
            self.error("no dataset")
        print("**DOWNLOAD** dataset {0} from {1} to {2}.".format(dataset, self.hash_datasets[dataset]["url"], self.local_path))
        wget.download(self.hash_datasets[dataset]["url"], self.local_path)
        print("\nDownload completed.")

    def __process_nsrl_support(self, isofile, dataset_file, key):
        """ Process support NSRL data (OS, Product, Vendor/Manufacturer)

        :param isofile: Accessor for the ISO image. When set to False the NSRL dataset was provided as a ZIP file
        :param dataset_file: Location of the dataset, either a path inside the ISO or a direct file path
        :param key: Type of support NSRL data (MfgCode, OpSystemCode or ProductCode)
        """
        print("\n Work on {0}".format(dataset_file))
        if isofile:
            with_element = isofile.IsoPath("/{0}".format(dataset_file)).open()
        else:
            with_element = open(dataset_file, encoding='utf-8')
        ln = 0
        with with_element as f:
            while True:
                line = f.readline()
                if not line:
                    break
                if ln == 0:
                    # The first line contains the CSV headers
                    headers = line.rstrip().replace("\"", "").split(",")
                else:
                    records = line.rstrip().replace("\"", "").split(",")
                    drecords = {}
                    for index, value in enumerate(records):
                        try:
                            drecords[headers[index]] = value
                        except IndexError:
                            # Skip fields without a matching header
                            continue
                    self.rdb.sadd("s:{0}".format(key), drecords[key])
                    # hmset was removed in recent redis-py; hset with mapping= is the replacement
                    self.rdb.hset("h-{0}:{1}".format(key, drecords[key]), mapping=drecords)
                    # Path().stem strips the extension case-insensitively; a plain
                    # rfind(".txt") misses the uppercase ".TXT" names used on the ISO
                    stat_import_key = Path(dataset_file).stem
                    self.rdb.incrby("stat:{0}-import".format(stat_import_key))
                if ln % self.mod_lines == 0:
                    print(" Imported {0} records.".format(ln))
                if ln == self.max_value:
                    break
                ln = ln + 1
        print(" Finished importing {0} records.".format(ln))

    def __process_nsrl_base(self, isofile, dataset_file, rdbkey):
        """ Process base NSRL data (file hashes)

        :param isofile: Accessor for the ISO image. When set to False the NSRL dataset was provided as a ZIP file
        :param dataset_file: Location of the dataset, either a path inside the ISO or a direct file path
        :param rdbkey: Redis database key (corresponds with the key of the dataset in config.json)
        """
        print("\n Work on {0}".format(dataset_file))
        if isofile:
            # We received the NSRL dataset as an ISO file.
            # First get the ZIP from the ISO, then extract the ZIP itself.
            with open(self.local_path + dataset_file, "wb") as zip_f:
                with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
                    zip_f.write(f.read())
            zip_f = zipfile.ZipFile(self.local_path + dataset_file)
            zip_f.extractall(self.local_path)
            local_dataset_file = self.local_path + "NSRLFile.txt"
        else:
            # No additional actions needed: we received the NSRL as a ZIP file
            local_dataset_file = dataset_file
        ln = 0
        with open(local_dataset_file, "r") as lines:
            for line in lines:
                if ln == 0:
                    headers = line.rstrip().replace("\"", "").split(",")
                else:
                    records = line.rstrip().replace("\"", "").split(",")
                    drecords = {}
                    for index, value in enumerate(records):
                        try:
                            drecords[headers[index]] = value
                        except IndexError:
                            # Skip fields without a matching header
                            continue
                    # Add some metadata
                    drecords['source'] = "NSRL"
                    drecords['db'] = rdbkey
                    drecords['insert-timestamp'] = time.time()
                    # Base records
                    self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
                    self.rdb.hset("h:{}".format(drecords['SHA-1']), mapping=drecords)
                    self.rdb.incrby("stat:{0}".format(rdbkey))
                if ln % self.mod_lines == 0:
                    print(" Imported {0} records.".format(ln))
                if ln == self.max_value:
                    break
                ln = ln + 1
        print(" Finished importing {0} records.".format(ln))

    def process(self, dataset=False):
        """ Process a dataset

        :param dataset: The dataset to process
        """
        if not dataset:
            self.error("no dataset")
        local_dataset = self.local_path + self.hash_datasets[dataset]["url"][self.hash_datasets[dataset]["url"].rfind("/")+1:]
        print("**PROCESS** dataset {0} from location {1}.".format(dataset, local_dataset))
        if not Path(local_dataset).is_file():
            self.error("Cannot find file {0}".format(local_dataset))
        # Determine the dataset file type; lowercase the extension so ".ISO" also matches
        dataset_file_type = local_dataset[local_dataset.rfind(".")+1:].lower()
        if dataset_file_type == "iso":
            # We read directly from the ISO file
            isofile = pathlab.IsoAccessor(local_dataset)
            self.__process_nsrl_base(isofile, "NSRLFILE.ZIP", dataset)
            self.__process_nsrl_support(isofile, "NSRLMFG.TXT", "MfgCode")
            self.__process_nsrl_support(isofile, "NSRLOS.TXT", "OpSystemCode")
            self.__process_nsrl_support(isofile, "NSRLPROD.TXT", "ProductCode")
        elif dataset_file_type == "zip":
            # Extract the ZIP
            zip_f = zipfile.ZipFile(local_dataset)
            zip_f.extractall(self.local_path)
            # NSRL ZIPs store the data files in a subdirectory
            namelist_first = zip_f.namelist()[0]
            zip_extract_path = ""
            if namelist_first[-1] == "/":
                zip_extract_path = self.local_path + namelist_first
            # Indicate we don't have an ISO object
            isofile = False
            self.__process_nsrl_base(isofile, zip_extract_path + "NSRLFile.txt", dataset)
            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLMfg.txt", "MfgCode")
            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLOS.txt", "OpSystemCode")
            self.__process_nsrl_support(isofile, zip_extract_path + "NSRLProd.txt", "ProductCode")
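
    # Quick sanity check after an import, from redis-cli (keys as written by
    # the methods above; the SHA-1 is a placeholder):
    #
    #   GET stat:<dataset>
    #   HGETALL h:<SHA-1>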

    def init(self, dataset=False):
        """ Remove / Initialize a dataset

        :param dataset: Affected dataset
        """
        if not dataset:
            self.error("no dataset")
        print("**INIT** dataset {0}.".format(dataset))
        if self.flushrdb:
            # flushdb_on_init is set: wipe the entire Redis database
            self.rdb.flushdb()
        else:
            self.rdb.delete("stat:{0}".format(dataset))
        self.rdb.set("stat:{0}".format(dataset), 0)

    def datasetlist(self):
        """ List the available datasets """
        for nsrl in self.hash_datasets:
            print("{0}\n {1}\n from: {2}\n".format(nsrl, self.hash_datasets[nsrl]["description"], self.hash_datasets[nsrl]["url"]))

    def valid_dataset(self, dataset):
        """ Verify that the dataset exists

        :param dataset: Affected dataset
        """
        return dataset in self.hash_datasets

    def error(self, error):
        """ Print an error message and exit

        :param error: Error message
        """
        print("!!ERROR!! {0}".format(error))
        sys.exit()

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("-l", "--list", action="store_true", help="List datasets available for download and import.")
group.add_argument("-i", "--import-dataset", help="Import a dataset.")
group.add_argument("-e", "--init-dataset", help="Remove / initialize a dataset.")
parser.add_argument("-d", "--skip-download", action="store_true", help="Skip downloading the dataset.")
parser.add_argument("-c", "--skip-init", action="store_true", help="Skip initialization of the database.")
args = parser.parse_args()

# Use a distinct instance name so the class import_hash is not shadowed
importer = import_hash()

if args.list:
    importer.datasetlist()
elif args.import_dataset:
    dataset = args.import_dataset
    if importer.valid_dataset(dataset):
        if not args.skip_download:
            importer.download(dataset=dataset)
        if not args.skip_init:
            importer.init(dataset=dataset)
        importer.process(dataset=dataset)
    else:
        print("Dataset not found.")
elif args.init_dataset:
    dataset = args.init_dataset
    if importer.valid_dataset(dataset):
        importer.init(dataset=dataset)
    else:
        print("Dataset not found.")