mirror of
https://github.com/adulau/hashlookup-server.git
synced 2024-11-22 10:07:11 +00:00
Support for import of NSRL datasets in ISO and ZIP format
This commit is contained in:
parent
d280216361
commit
ca910ff22f
3 changed files with 165 additions and 32 deletions
|
@ -2,9 +2,33 @@
|
||||||
|
|
||||||
PoC to better streamline the import of NSRL data.
|
PoC to better streamline the import of NSRL data.
|
||||||
|
|
||||||
Todo:
|
## Usage
|
||||||
|
|
||||||
- Test with the other data sets (currently only Android was tested) : Fetch from ZIP and not ISO file
|
```
|
||||||
|
$ python3 import-hashlookup-server.py -h
|
||||||
|
usage: import-hashlookup-server.py [-h] [-l | -i IMPORT_DATASET | -e INIT_DATASET] [-d] [-c]
|
||||||
|
|
||||||
|
optional arguments:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
-l, --list List datasets available for download and import.
|
||||||
|
-i IMPORT_DATASET, --import-dataset IMPORT_DATASET
|
||||||
|
Import a dataset.
|
||||||
|
-e INIT_DATASET, --init-dataset INIT_DATASET
|
||||||
|
Remove / initialize a dataset.
|
||||||
|
-d, --skip-download Skip downloading the dataset.
|
||||||
|
-c, --skip-init Skip initialization of the database.
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
$ python3 import-hashlookup-server.py -i nsrl_minimal
|
||||||
|
```
|
||||||
|
|
||||||
|
## Todo
|
||||||
|
|
||||||
|
|
||||||
|
- ~~Test with the other data sets (currently only Android was tested) : Fetch from ZIP and not ISO file~~
|
||||||
|
- Move older input scripts to "old" directory
|
||||||
|
- Complete with sha256 and xcycl
|
||||||
- Error handling (sufficient drive space, Redis active, check if there is already a db before init)
|
- Error handling (sufficient drive space, Redis active, check if there is already a db before init)
|
||||||
- Multiple data sets at once?
|
- Multiple data sets at once?
|
||||||
- Import from MISP (depends on filter)
|
- Import from MISP (depends on filter)
|
|
@ -4,15 +4,20 @@
|
||||||
"nsrl_minimal": { "description": "(minimal) - contains the set of DISTINCT appearances of files in modern applications; no file entries are duplicated",
|
"nsrl_minimal": { "description": "(minimal) - contains the set of DISTINCT appearances of files in modern applications; no file entries are duplicated",
|
||||||
"url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
|
"url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
|
||||||
"nsrl_android": { "description": "Contains modern Android mobile applications",
|
"nsrl_android": { "description": "Contains modern Android mobile applications",
|
||||||
"url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"}
|
"url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"},
|
||||||
|
"nsrl_unique": { "description": "Contains the set of file entries that appear ONLY ONCE in the entire NSRL collection; these are unique to each of the applications that are in the collection",
|
||||||
|
"url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernu.zip"},
|
||||||
|
"nsrl_ios": { "description": "IOS dataset",
|
||||||
|
"url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_ios.iso"}
|
||||||
},
|
},
|
||||||
"local_path": "/home/koenv/nsrl/",
|
"local_path": "/home/koenv/nsrl/",
|
||||||
"import": {
|
"import": {
|
||||||
"max_value": 500000000,
|
"max_value": 500000000,
|
||||||
"mod_lines": 1000
|
"mod_lines": 2500
|
||||||
},
|
},
|
||||||
"redis": {
|
"redis": {
|
||||||
"hostname": "127.0.0.1",
|
"hostname": "127.0.0.1",
|
||||||
"port": 6666
|
"port": 6666,
|
||||||
|
"flushdb_on_init": 1
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -5,6 +5,8 @@ import wget
|
||||||
import sys
|
import sys
|
||||||
import redis
|
import redis
|
||||||
import json
|
import json
|
||||||
|
import time
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
class import_hash:
|
class import_hash:
|
||||||
|
@ -17,10 +19,14 @@ class import_hash:
|
||||||
self.local_path = data["local_path"]
|
self.local_path = data["local_path"]
|
||||||
redis_host = data["redis"]["hostname"]
|
redis_host = data["redis"]["hostname"]
|
||||||
redis_port = data["redis"]["port"]
|
redis_port = data["redis"]["port"]
|
||||||
|
self.flushrdb = data["redis"]["flushdb_on_init"]
|
||||||
|
|
||||||
self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
|
self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
|
||||||
|
|
||||||
def download(self, dataset=False):
|
def download(self, dataset=False):
|
||||||
|
""" Download a dataset
|
||||||
|
:param dataset: The dataset to use. This is a key looked for in the config.json file to get the correct download URL
|
||||||
|
"""
|
||||||
if not dataset:
|
if not dataset:
|
||||||
self.error("no dataset")
|
self.error("no dataset")
|
||||||
|
|
||||||
|
@ -28,12 +34,21 @@ class import_hash:
|
||||||
wget.download(self.hash_datasets[dataset]["url"], self.local_path)
|
wget.download(self.hash_datasets[dataset]["url"], self.local_path)
|
||||||
print("\nDownload completed.")
|
print("\nDownload completed.")
|
||||||
|
|
||||||
def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
|
def __process_nsrl_support(self, isofile, dataset_file, key):
|
||||||
|
""" Process support NSRL data (OS, Product, Vendor/Manufacturer)
|
||||||
|
:param isofile: The ISO file object. When set to False this indicates the NSRL is provided in a ZIP format
|
||||||
|
:param dataset_file: The location of the dataset, is either a path in the ISO or a direct filepath
|
||||||
|
:param key: type of support NSRL data
|
||||||
|
"""
|
||||||
print("\n Work on {0}".format(dataset_file))
|
print("\n Work on {0}".format(dataset_file))
|
||||||
|
|
||||||
|
if isofile:
|
||||||
|
with_element = isofile.IsoPath("/{0}".format(dataset_file)).open()
|
||||||
|
else:
|
||||||
|
with_element = open(dataset_file, encoding='utf-8')
|
||||||
|
|
||||||
ln = 0
|
ln = 0
|
||||||
with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
|
with with_element as f:
|
||||||
while True:
|
while True:
|
||||||
l = f.readline()
|
l = f.readline()
|
||||||
|
|
||||||
|
@ -51,9 +66,10 @@ class import_hash:
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
self.rdb.sadd("s:{0}".format(key1), drecords[key1])
|
self.rdb.sadd("s:{0}".format(key), drecords[key])
|
||||||
self.rdb.hmset("h-{0}:{1}".format(key1, drecords[key1]), drecords)
|
self.rdb.hmset("h-{0}:{1}".format(key, drecords[key]), drecords)
|
||||||
self.rdb.incrby("stat:{0}-import".format(key2))
|
stat_import_key = dataset_file[dataset_file.rfind("/")+1:dataset_file.rfind(".txt")]
|
||||||
|
self.rdb.incrby("stat:{0}-import".format(stat_import_key))
|
||||||
if ln % self.mod_lines == 0:
|
if ln % self.mod_lines == 0:
|
||||||
print(" Imported {0} records.".format(ln))
|
print(" Imported {0} records.".format(ln))
|
||||||
|
|
||||||
|
@ -63,9 +79,16 @@ class import_hash:
|
||||||
ln = ln + 1
|
ln = ln + 1
|
||||||
print(" Finished, importing {0} records.".format(ln))
|
print(" Finished, importing {0} records.".format(ln))
|
||||||
|
|
||||||
def __process_nsrl_zip(self, isofile, dataset_file, key):
|
def __process_nsrl_base(self, isofile, dataset_file, rdbkey):
|
||||||
|
""" Process base NSRL data (file hashes)
|
||||||
|
:param isofile: The ISO file object. When set to False this indicates the NSRL is provided in a ZIP format
|
||||||
|
:param dataset_file: The location of the dataset, is either a path in the ISO or a direct filepath
|
||||||
|
:param rdbkey: redis database key (corresponds with key of dataset in config.json)
|
||||||
|
"""
|
||||||
print("\n Work on {0}".format(dataset_file))
|
print("\n Work on {0}".format(dataset_file))
|
||||||
|
|
||||||
|
if isofile:
|
||||||
|
# We received the NSRL dataset as an ISO file
|
||||||
# First get the ZIP from the ISO and then extract the ZIP
|
# First get the ZIP from the ISO and then extract the ZIP
|
||||||
zip_f = open(self.local_path + dataset_file, "wb")
|
zip_f = open(self.local_path + dataset_file, "wb")
|
||||||
with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
|
with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
|
||||||
|
@ -74,8 +97,14 @@ class import_hash:
|
||||||
zip_f = zipfile.ZipFile(self.local_path + dataset_file)
|
zip_f = zipfile.ZipFile(self.local_path + dataset_file)
|
||||||
zip_f.extractall(self.local_path)
|
zip_f.extractall(self.local_path)
|
||||||
|
|
||||||
|
local_dataset_file = self.local_path + "NSRLFile.txt"
|
||||||
|
else:
|
||||||
|
# No need to do additional actions
|
||||||
|
# We probably received the NSRL as a ZIP file
|
||||||
|
local_dataset_file = dataset_file
|
||||||
|
|
||||||
ln = 0
|
ln = 0
|
||||||
lines = open(self.local_path + "NSRLFile.txt", "r")
|
lines = open(local_dataset_file, "r")
|
||||||
|
|
||||||
for l in lines:
|
for l in lines:
|
||||||
if ln == 0:
|
if ln == 0:
|
||||||
|
@ -89,9 +118,15 @@ class import_hash:
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Add some meta data
|
||||||
|
drecords['source'] = "NSRL"
|
||||||
|
drecords['db'] = rdbkey
|
||||||
|
drecords['insert-timestamp'] = time.time()
|
||||||
|
|
||||||
|
# Base records
|
||||||
self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
|
self.rdb.set("l:{}".format(drecords['MD5']), drecords['SHA-1'])
|
||||||
self.rdb.hmset("h:{}".format(drecords['SHA-1']), drecords)
|
self.rdb.hmset("h:{}".format(drecords['SHA-1']), drecords)
|
||||||
self.rdb.incrby("stat:{0}".format(key))
|
self.rdb.incrby("stat:{0}".format(rdbkey))
|
||||||
if ln % self.mod_lines == 0:
|
if ln % self.mod_lines == 0:
|
||||||
print(" Imported {0} records.".format(ln))
|
print(" Imported {0} records.".format(ln))
|
||||||
|
|
||||||
|
@ -101,6 +136,9 @@ class import_hash:
|
||||||
print(" Finished, importing {0} records.".format(ln))
|
print(" Finished, importing {0} records.".format(ln))
|
||||||
|
|
||||||
def process(self, dataset=False):
|
def process(self, dataset=False):
|
||||||
|
"""Process a dataset
|
||||||
|
:param dataset: The dataset to process
|
||||||
|
"""
|
||||||
if not dataset:
|
if not dataset:
|
||||||
self.error("no dataset")
|
self.error("no dataset")
|
||||||
|
|
||||||
|
@ -115,28 +153,94 @@ class import_hash:
|
||||||
dataset_file_type = local_dataset[local_dataset.rfind(".")+1:]
|
dataset_file_type = local_dataset[local_dataset.rfind(".")+1:]
|
||||||
|
|
||||||
if dataset_file_type == "iso":
|
if dataset_file_type == "iso":
|
||||||
|
# We read directly from the ISO file
|
||||||
isofile = pathlab.IsoAccessor(local_dataset)
|
isofile = pathlab.IsoAccessor(local_dataset)
|
||||||
|
|
||||||
self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
|
self.__process_nsrl_base(isofile, "NSRLFILE.ZIP", dataset)
|
||||||
self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
|
self.__process_nsrl_support(isofile, "NSRLMFG.TXT", "MfgCode")
|
||||||
self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
|
self.__process_nsrl_support(isofile, "NSRLOS.TXT", "OpSystemCode")
|
||||||
self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")
|
self.__process_nsrl_support(isofile, "NSRLPROD.TXT", "ProductCode")
|
||||||
|
elif dataset_file_type == "zip":
|
||||||
|
# Extract the ZIP
|
||||||
|
zip_f = zipfile.ZipFile(local_dataset)
|
||||||
|
zip_f.extractall(self.local_path)
|
||||||
|
# NSRL ZIPs store the datafiles in a subdirectory
|
||||||
|
namelist_first = zip_f.namelist()[0]
|
||||||
|
zip_extract_path = ""
|
||||||
|
if namelist_first[-1] == "/":
|
||||||
|
zip_extract_path = self.local_path + namelist_first
|
||||||
|
# Indicate we don't have an ISO object
|
||||||
|
isofile = False
|
||||||
|
|
||||||
|
self.__process_nsrl_base(isofile, zip_extract_path + "NSRLFile.txt", dataset)
|
||||||
|
self.__process_nsrl_support(isofile, zip_extract_path + "NSRLMfg.txt", "MfgCode")
|
||||||
|
self.__process_nsrl_support(isofile, zip_extract_path + "NSRLOS.txt", "OpSystemCode")
|
||||||
|
self.__process_nsrl_support(isofile, zip_extract_path + "NSRLProd.txt", "ProductCode")
|
||||||
|
|
||||||
def init(self, dataset=False):
|
def init(self, dataset=False):
|
||||||
|
""" Remove / Initialize a dataset
|
||||||
|
:param dataset: Affected dataset
|
||||||
|
"""
|
||||||
if not dataset:
|
if not dataset:
|
||||||
self.error("no dataset")
|
self.error("no dataset")
|
||||||
|
|
||||||
print("**INIT** dataset {0} .".format(dataset))
|
print("**INIT** dataset {0} .".format(dataset))
|
||||||
|
|
||||||
|
if self.flushrdb:
|
||||||
|
self.rdb.flushdb()
|
||||||
|
else:
|
||||||
self.rdb.delete("stat:{0}".format(dataset))
|
self.rdb.delete("stat:{0}".format(dataset))
|
||||||
self.rdb.set("stat:{0}".format(dataset), 0)
|
self.rdb.set("stat:{0}".format(dataset), 0)
|
||||||
|
|
||||||
|
def datasetlist(self):
|
||||||
|
""" List the available datasets
|
||||||
|
"""
|
||||||
|
for nsrl in self.hash_datasets:
|
||||||
|
print("{0}\n {1}\n from: {2}\n".format(nsrl, self.hash_datasets[nsrl]["description"], self.hash_datasets[nsrl]["url"]))
|
||||||
|
|
||||||
|
def valid_dataset(self, dataset):
|
||||||
|
""" Verify if the dataset exists
|
||||||
|
:param dataset: Affected dataset
|
||||||
|
"""
|
||||||
|
if dataset in self.hash_datasets:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
def error(self, error):
|
def error(self, error):
|
||||||
|
""" Print an error message and exit
|
||||||
|
:param error: Error message
|
||||||
|
"""
|
||||||
print("!!ERROR!! {0}".format(error))
|
print("!!ERROR!! {0}".format(error))
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
group = parser.add_mutually_exclusive_group()
|
||||||
|
group.add_argument("-l", "--list", action="store_true", help="List datasets available for download and import.")
|
||||||
|
group.add_argument("-i", "--import-dataset", help="Import a dataset.")
|
||||||
|
group.add_argument("-e", "--init-dataset", help="Remove / initialize a dataset.")
|
||||||
|
parser.add_argument("-d", "--skip-download", action="store_true", help="Skip downloading the dataset.")
|
||||||
|
parser.add_argument("-c", "--skip-init", action="store_true", help="Skip initialization of the database.")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
import_hash = import_hash()
|
import_hash = import_hash()
|
||||||
#import_hash.download(dataset="nsrl_android")
|
|
||||||
import_hash.init(dataset="nsrl_android")
|
if args.list:
|
||||||
import_hash.process(dataset="nsrl_android")
|
import_hash.datasetlist()
|
||||||
|
elif args.import_dataset:
|
||||||
|
dataset = args.import_dataset
|
||||||
|
if import_hash.valid_dataset(dataset):
|
||||||
|
if not args.skip_download:
|
||||||
|
import_hash.download(dataset=dataset)
|
||||||
|
if not args.skip_init:
|
||||||
|
import_hash.init(dataset=dataset)
|
||||||
|
import_hash.process(dataset=dataset)
|
||||||
|
else:
|
||||||
|
print("Dataset not found.")
|
||||||
|
elif args.init_dataset:
|
||||||
|
dataset = args.init_dataset
|
||||||
|
if import_hash.valid_dataset(dataset):
|
||||||
|
import_hash.init(dataset=dataset)
|
||||||
|
else:
|
||||||
|
print("Dataset not found.")
|
||||||
|
|
Loading…
Reference in a new issue