PoC for streamlining import

PoC to streamline the import of NSRL data.
Still requires some work, but the basic concept works.
Currently only tested with the Android data set.
Koen Van Impe 2021-07-27 18:37:12 +02:00
parent 97e2d2c8aa
commit 0edb33ed82
4 changed files with 175 additions and 0 deletions

bin/import-poc/README.md Normal file

@@ -0,0 +1,10 @@
# hashlookup-server
PoC to streamline the import of NSRL data.
Todo:
- Test with the other data sets (currently only Android has been tested): fetch from the ZIP rather than the ISO file
- Error handling (sufficient disk space, Redis reachable, check whether a database already exists before init)
- Import multiple data sets at once?
- Import from MISP (depends on filter)
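
Basic usage mirrors the driver lines at the bottom of the import script: instantiate the class, initialise the per-dataset counter, download the data set and process it. A minimal sketch (the dataset name must be a key under `nsrl_downloads` in `config.json`):

```python
# With the ImportHash class from the import script in scope:
nsrl = ImportHash()                    # reads config.json and connects to Redis
nsrl.init(dataset="nsrl_android")      # reset the stat:nsrl_android counter in Redis
nsrl.download(dataset="nsrl_android")  # fetch the ISO into local_path
nsrl.process(dataset="nsrl_android")   # walk the ISO and import the records into Redis
```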

bin/import-poc/config.json Normal file

@@ -0,0 +1,18 @@
{ "nsrl_downloads": {
"nsrl_modern_rds": { "description": "(microcomputer applications) - contains the comprehensive set of ALL appearances of files in modern applications; many file entries are duplicated",
"url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_modern.iso"},
"nsrl_minimal": { "description": "(minimal) - contains the set of DISTINCT appearances of files in modern applications; no file entries are duplicated",
"url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip"},
"nsrl_android": { "description": "Contains modern Android mobile applications",
"url": "https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/RDS_android.iso"}
},
"local_path": "/home/koenv/nsrl/",
"import": {
"max_value": 500000000,
"mod_lines": 1000
},
"redis": {
"hostname": "127.0.0.1",
"port": 6666
}
}
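
For reference, `import.max_value` caps the number of records read per file and `import.mod_lines` controls how often a progress line is printed. A minimal sketch of listing the configured data sets (only the keys shown in the file above are assumed):

```python
import json

with open("config.json") as config_file:
    cfg = json.load(config_file)

# Print each configured NSRL data set with its download URL
for name, entry in cfg["nsrl_downloads"].items():
    print("{0}: {1}".format(name, entry["url"]))
```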

@@ -0,0 +1,142 @@
from pathlib import Path
import json
import sys
import zipfile

import pathlab
import redis
import wget


class ImportHash:

    def __init__(self):
        # Load the configuration and open the Redis connection
        with open('config.json') as config_file:
            data = json.load(config_file)
        self.hash_datasets = data["nsrl_downloads"]
        self.max_value = data["import"]["max_value"]
        self.mod_lines = data["import"]["mod_lines"]
        self.local_path = data["local_path"]
        redis_host = data["redis"]["hostname"]
        redis_port = data["redis"]["port"]
        self.rdb = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)

    def download(self, dataset=False):
        if not dataset:
            self.error("no dataset")
        print("**DOWNLOAD** dataset {0} from {1} to {2}.".format(dataset, self.hash_datasets[dataset]["url"], self.local_path))
        wget.download(self.hash_datasets[dataset]["url"], self.local_path)
        print("\nDownload completed.")

    def __process_nsrl_txt(self, isofile, dataset_file, key1, key2):
        # Import a CSV-style NSRL support file (NSRLMfg, NSRLOS, NSRLProd)
        # straight from the ISO: the first line holds the column headers,
        # every later line becomes a Redis hash keyed on the key1 column.
        print("\n Work on {0}".format(dataset_file))
        ln = 0
        with isofile.IsoPath("/{0}".format(dataset_file)).open() as f:
            while True:
                line = f.readline()
                if not line:
                    break
                if ln == 0:
                    headers = line.rstrip().replace("\"", "").split(",")
                else:
                    records = line.rstrip().replace("\"", "").split(",")
                    drecords = {}
                    for index, value in enumerate(records):
                        try:
                            drecords[headers[index]] = value
                        except IndexError:
                            # Row has more fields than headers; skip the extras
                            continue
                    self.rdb.sadd("s:{0}".format(key1), drecords[key1])
                    self.rdb.hset("h-{0}:{1}".format(key1, drecords[key1]), mapping=drecords)
                    self.rdb.incrby("stat:{0}-import".format(key2))
                if ln > 0 and ln % self.mod_lines == 0:
                    print(" Imported {0} records.".format(ln))
                if ln == self.max_value:
                    break
                ln += 1
        print(" Finished importing {0} records.".format(ln))

    def __process_nsrl_zip(self, isofile, dataset_file, key):
        # Extract the ZIP from the ISO, unpack it and import the contained
        # NSRLFile.txt: an MD5->SHA-1 lookup key plus a full record hash
        # per line, and a per-dataset import counter.
        print("\n Work on {0}".format(dataset_file))
        # First copy the ZIP out of the ISO, then extract it locally
        with isofile.IsoPath("/{0}".format(dataset_file)).open("rb") as f:
            with open(self.local_path + dataset_file, "wb") as zip_out:
                zip_out.write(f.read())
        with zipfile.ZipFile(self.local_path + dataset_file) as zip_f:
            zip_f.extractall(self.local_path)
        ln = 0
        with open(self.local_path + "NSRLFile.txt", "r") as lines:
            for line in lines:
                if ln == 0:
                    headers = line.rstrip().replace("\"", "").split(",")
                else:
                    records = line.rstrip().replace("\"", "").split(",")
                    drecords = {}
                    for index, value in enumerate(records):
                        try:
                            drecords[headers[index]] = value
                        except IndexError:
                            continue
                    self.rdb.set("l:{0}".format(drecords['MD5']), drecords['SHA-1'])
                    self.rdb.hset("h:{0}".format(drecords['SHA-1']), mapping=drecords)
                    self.rdb.incrby("stat:{0}".format(key))
                if ln > 0 and ln % self.mod_lines == 0:
                    print(" Imported {0} records.".format(ln))
                if ln == self.max_value:
                    break
                ln += 1
        print(" Finished importing {0} records.".format(ln))

    def process(self, dataset=False):
        if not dataset:
            self.error("no dataset")
        local_dataset = self.local_path + self.hash_datasets[dataset]["url"][self.hash_datasets[dataset]["url"].rfind("/")+1:]
        print("**PROCESS** dataset {0} from location {1}.".format(dataset, local_dataset))
        if not Path(local_dataset).is_file():
            self.error("Cannot find file {0}".format(local_dataset))
        # Determine the dataset file type from the lowercased extension
        dataset_file_type = local_dataset[local_dataset.rfind(".")+1:].lower()
        if dataset_file_type == "iso":
            isofile = pathlab.IsoAccessor(local_dataset)
            self.__process_nsrl_zip(isofile, "NSRLFILE.ZIP", "NSRLAndroid")
            self.__process_nsrl_txt(isofile, "NSRLMFG.TXT", "MfgCode", "NSRLMfg")
            self.__process_nsrl_txt(isofile, "NSRLOS.TXT", "OpSystemCode", "NSRLOS")
            self.__process_nsrl_txt(isofile, "NSRLPROD.TXT", "ProductCode", "NSRLProd")
        else:
            self.error("Unsupported dataset file type {0} (ZIP-only data sets are still on the todo list)".format(dataset_file_type))

    def init(self, dataset=False):
        if not dataset:
            self.error("no dataset")
        print("**INIT** dataset {0}.".format(dataset))
        self.rdb.delete("stat:{0}".format(dataset))
        self.rdb.set("stat:{0}".format(dataset), 0)

    def error(self, error):
        print("!!ERROR!! {0}".format(error))
        sys.exit()


import_hash = ImportHash()
#import_hash.download(dataset="nsrl_android")
import_hash.init(dataset="nsrl_android")
import_hash.process(dataset="nsrl_android")
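
After a run completes, the imported records can be queried straight from Redis using the key layout the script writes: `l:<MD5>` maps an MD5 to its SHA-1, `h:<SHA-1>` holds the full record as a hash, and `stat:NSRLAndroid` counts imported records. A minimal lookup sketch (the MD5 value is a placeholder, not taken from the data set):

```python
import redis

rdb = redis.Redis(host="127.0.0.1", port=6666, decode_responses=True)

md5 = "0123456789ABCDEF0123456789ABCDEF"        # placeholder digest
sha1 = rdb.get("l:{0}".format(md5))             # MD5 -> SHA-1 lookup key written by the importer
if sha1:
    record = rdb.hgetall("h:{0}".format(sha1))  # full NSRL record stored as a Redis hash
    print(record)
print(rdb.get("stat:NSRLAndroid"))              # number of records imported from NSRLFile.txt
```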

bin/import-poc/requirements.txt Normal file

@@ -0,0 +1,5 @@
pathlab
redis
wget