hashlookup-server/bin/server.py

546 lines
20 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python
# Server version string; passed to the API metadata and reported by /info.
version = "1.2"
2022-01-15 13:56:47 +00:00
from flask import (
Flask,
url_for,
send_from_directory,
render_template,
make_response,
request,
)
from flask_restx import Resource, Api, reqparse
import redis
import configparser
import json
# Configuration is loaded once at import time. The path is relative to the
# current working directory of the process.
config = configparser.ConfigParser()
config.read('../etc/server.conf')

# [global] section: switches for statistics, pub/sub notifications and the
# public statistics endpoint.
stats, stats_pubsub, stats_public = (
    config['global'].getboolean(option)
    for option in ('stats', 'stats_pubsub', 'stats_public')
)

# Increment applied to the statistics sorted sets for every lookup.
score = 1

# [session] section: feature switch and session lifetime (kept as read from
# the configuration file, i.e. not converted to an integer).
session = config['session'].getboolean('enable')
session_ttl = config['session'].get('ttl')

app = Flask(__name__)
# Routes match with or without a trailing slash.
app.url_map.strict_slashes = False
2022-01-15 13:56:47 +00:00
# REST API wrapper around the Flask application; the interactive
# documentation is served at the root path.
api = Api(
    app,
    version=version,
    title='hashlookup CIRCL API',
    description='![](https://www.circl.lu/assets/images/circl-logo.png)\n[CIRCL hash lookup](https://hashlookup.circl.lu/) is a public API to lookup hash values against known database of files. For more details about all the datasets included [visit the website of the project](https://www.circl.lu/services/hashlookup/). The API is accessible via HTTP ReST API and the API is also [described as an OpenAPI](https://hashlookup.circl.lu/swagger.json). A [documentation is available with](https://www.circl.lu/services/hashlookup/) with sample queries and software using hashlookup. An offline version as Bloom filter is also [available](https://circl.lu/services/hashlookup/#how-to-quickly-check-a-set-of-files-in-a-local-directory). The API can be tested live in the interface below.',
    doc='/',
    license='CC-BY',
    contact='info@circl.lu',
    ordered=True,
)

# Shared connection to the local key-value backend; replies are decoded to
# text instead of being returned as bytes.
rdb = redis.Redis(
    host='127.0.0.1',
    port='6666',
    decode_responses=True,
)
2022-01-15 13:56:47 +00:00
def is_hex(s):
    """Return True when ``s`` is a non-empty string of hexadecimal digits.

    Only the ASCII characters ``0-9``, ``a-f`` and ``A-F`` are accepted.
    Parsing with ``int(s, 16)`` is deliberately avoided: it also accepts a
    ``0x`` prefix, a leading sign, surrounding whitespace, ``_`` separators
    and non-ASCII digits, none of which may appear in a hash digest, and it
    raises ``TypeError`` for non-string input.

    :param s: candidate value (any type).
    :return: ``True`` if ``s`` is a hexadecimal string, ``False`` otherwise.
    """
    if not isinstance(s, str) or not s:
        return False
    hex_digits = "0123456789abcdefABCDEF"
    return all(char in hex_digits for char in s)
2022-01-15 13:56:47 +00:00
def check_md5(value=None):
    """Validate an MD5 digest given as text.

    :param value: candidate digest, or ``None``.
    :return: the upper-cased digest when ``value`` is 32 characters long and
        accepted by ``is_hex``; ``False`` otherwise.
    """
    well_formed = value is not None and len(value) == 32 and is_hex(value)
    if not well_formed:
        return False
    return value.upper()
2022-01-15 13:56:47 +00:00
def check_sha1(value=None):
    """Validate a SHA-1 digest given as text.

    :param value: candidate digest, or ``None``.
    :return: the upper-cased digest when ``value`` is 40 characters long and
        accepted by ``is_hex``; ``False`` otherwise.
    """
    well_formed = value is not None and len(value) == 40 and is_hex(value)
    if not well_formed:
        return False
    return value.upper()
2022-01-15 13:56:47 +00:00
def check_sha256(value=None):
    """Validate a SHA-256 digest given as text.

    :param value: candidate digest, or ``None``.
    :return: the upper-cased digest when ``value`` is 64 characters long and
        accepted by ``is_hex``; ``False`` otherwise.
    """
    well_formed = value is not None and len(value) == 64 and is_hex(value)
    if not well_formed:
        return False
    return value.upper()
2022-01-15 13:56:47 +00:00
def client_info():
    """Describe the client that issued the current request.

    :return: dict with ``ip_addr`` (the ``X-Forwarded-For`` value when the
        header is present, otherwise the remote address), ``user_agent`` and
        ``auth`` (the ``Authorization`` header value, or ``None``).
    """
    environ = request.environ
    forwarded_for = environ.get('HTTP_X_FORWARDED_FOR')
    # Prefer the forwarded address when one was supplied with the request.
    ip = environ['REMOTE_ADDR'] if forwarded_for is None else forwarded_for
    return {
        'ip_addr': ip,
        'user_agent': request.headers.get('User-Agent'),
        'auth': environ.get('HTTP_AUTHORIZATION'),
    }
def pub_lookup(channel=None, k=None):
    """Publish a lookup event on a pub/sub channel.

    The message is the JSON-encoded client description returned by
    ``client_info`` with the looked-up value added under ``value``.

    :param channel: channel name to publish on.
    :param k: value that was looked up.
    :return: ``False`` when either argument is ``None`` (nothing is
        published), ``True`` after publishing.
    """
    if channel is None or k is None:
        return False
    message = client_info()
    message['value'] = k
    rdb.publish(channel, json.dumps(message))
    return True
2022-01-15 13:56:47 +00:00
def get_session():
    """Return the remaining lifetime of the session named by the request.

    The session name is read from the ``hashlookup_session`` request header.

    :return: the remaining time-to-live in seconds (a positive integer) of
        the ``session:<name>`` key, or ``False`` when the session feature is
        disabled, the header is absent, or the session is missing, has no
        expiry, or is about to expire.
    """
    if session is False:
        return False
    session_name = request.headers.get('hashlookup_session')
    if session_name is None:
        return False
    # A single TTL call replaces the former EXISTS + TTL pair, so the key
    # cannot vanish between the two. TTL reports -2 for a missing key and -1
    # for a key without expiry. Callers pass the result straight to EXPIRE,
    # and a non-positive timeout there deletes the key, so only strictly
    # positive values are handed out.
    ttl = rdb.ttl("session:{}".format(session_name))
    if ttl is None or ttl < 1:
        return False
    print("Using session_name: {}".format(session_name))
    return ttl
2022-01-15 13:56:47 +00:00
def calculate_trust(hobject=None):
    """Compute the trust score of a record and store it in the record.

    The score starts at 50 (unknown), gains 5 points per known parent
    (``hashlookup:parent-total``), loses 20 points when the record carries a
    ``KnownMalicious`` key, and is capped at 100.

    :param hobject: record dict; it is updated in place.
    :return: the same dict with ``hashlookup:trust`` set, or ``False`` when
        ``hobject`` is ``None``.
    """
    if hobject is None:
        return False
    trust = 50 + (5 * hobject.get('hashlookup:parent-total', 0))
    penalty = 20 if 'KnownMalicious' in hobject else 0
    hobject['hashlookup:trust'] = min(trust - penalty, 100)
    return hobject
2022-01-15 13:56:47 +00:00
@api.route('/lookup/md5/<string:md5>')
@api.doc(description="Lookup MD5.")
class lookup(Resource):
    """Resource answering GET /lookup/md5/<md5>."""

    def get(self, md5):
        """Return the record known for an MD5 digest.

        Responds with a 400 message when ``md5`` is rejected by
        ``check_md5`` and with a 404 message when neither an ``l:`` nor an
        ``h:`` key exists for it. Otherwise the stored record is returned,
        expanded with OS/product details, parents, children and a trust
        score. Statistics, pub/sub notifications and session tracking are
        updated along the way when those features are enabled.
        """
        k = check_md5(value=md5)
        if k is False:
            return {
                'message': 'MD5 value incorrect, expecting a MD5 value in hex format'
            }, 400
        ttl = get_session() if session else False

        link_key = "l:{}".format(k)
        record_key = "h:{}".format(k)
        found = rdb.exists(link_key) or rdb.exists(record_key)

        # Hits and misses share the same bookkeeping; only the label differs.
        outcome = 'exist' if found else 'nx'
        if stats:
            rdb.zincrby("s:{}:md5".format(outcome), score, k)
        if stats_pubsub:
            pub_lookup(channel=outcome, k=k)
        if session and ttl is not False:
            session_key = "session:{}:{}".format(
                request.headers.get('hashlookup_session'), outcome
            )
            rdb.sadd(session_key, k)
            # Keep the per-session set alive exactly as long as the session.
            rdb.expire(session_key, ttl)
        if not found:
            return {'message': 'Non existing MD5', 'query': md5}, 404

        # The record is stored either directly under the queried value or
        # under the value that the "l:" key points to.
        if rdb.exists(record_key) and not rdb.exists(link_key):
            sha1 = k
            h = rdb.hgetall(record_key)
        else:
            sha1 = rdb.get(link_key)
            h = rdb.hgetall("h:{}".format(sha1))

        # Replace the OS/product codes by their detailed records, if any.
        for field in ("OpSystemCode", "ProductCode"):
            if field in h:
                detail_key = "h-{}:{}".format(field, h[field])
                if rdb.exists(detail_key):
                    h[field] = rdb.hgetall(detail_key)

        # Relations: all members for small sets, a random sample of 10 for
        # sets with more than 15 members.
        parent_key = "p:{}".format(sha1)
        if rdb.exists(parent_key):
            card = rdb.scard(parent_key)
            if card <= 15:
                p = rdb.smembers(parent_key)
            else:
                p = rdb.srandmember(parent_key, number=10)
            h['hashlookup:parent-total'] = card
            h['parents'] = [rdb.hgetall("h:{}".format(parent)) for parent in p]

        child_key = "c:{}".format(sha1)
        if rdb.exists(child_key):
            card = rdb.scard(child_key)
            if card <= 15:
                c = rdb.smembers(child_key)
            else:
                c = rdb.srandmember(child_key, number=10)
            h['hashlookup:children-total'] = card
            h['children'] = [rdb.hgetall("h:{}".format(child)) for child in c]

        return calculate_trust(hobject=h)
2022-01-15 13:56:47 +00:00
@api.route('/lookup/sha1/<string:sha1>')
@api.doc(description="Lookup SHA-1.")
class lookup(Resource):
    """Resource answering GET /lookup/sha1/<sha1>."""

    def get(self, sha1):
        """Return the record known for a SHA-1 digest.

        Responds with a 400 message when ``sha1`` is rejected by
        ``check_sha1`` and with a 404 message when no ``h:`` key exists for
        it. Otherwise the stored record is returned, expanded with
        OS/product details, parents, children and a trust score.
        Statistics, pub/sub notifications and session tracking are updated
        along the way when those features are enabled.
        """
        k = check_sha1(value=sha1)
        if k is False:
            return {
                'message': 'SHA1 value incorrect, expecting a SHA1 value in hex format'
            }, 400
        ttl = get_session() if session else False

        record_key = "h:{}".format(k)
        found = rdb.exists(record_key)

        # Hits and misses share the same bookkeeping; only the label differs.
        outcome = 'exist' if found else 'nx'
        if stats:
            rdb.zincrby("s:{}:sha1".format(outcome), score, k)
        if stats_pubsub:
            pub_lookup(channel=outcome, k=k)
        if session and ttl is not False:
            session_key = "session:{}:{}".format(
                request.headers.get('hashlookup_session'), outcome
            )
            rdb.sadd(session_key, k)
            # Keep the per-session set alive exactly as long as the session.
            rdb.expire(session_key, ttl)
        if not found:
            return {'message': 'Non existing SHA-1', 'query': sha1}, 404

        h = rdb.hgetall(record_key)

        # Replace the OS/product codes by their detailed records, if any.
        for field in ("OpSystemCode", "ProductCode"):
            if field in h:
                detail_key = "h-{}:{}".format(field, h[field])
                if rdb.exists(detail_key):
                    h[field] = rdb.hgetall(detail_key)

        # Relations: all members for small sets, a random sample of 10 for
        # sets with more than 15 members.
        parent_key = "p:{}".format(k)
        if rdb.exists(parent_key):
            card = rdb.scard(parent_key)
            if card <= 15:
                p = rdb.smembers(parent_key)
            else:
                p = rdb.srandmember(parent_key, number=10)
            h['hashlookup:parent-total'] = card
            h['parents'] = [rdb.hgetall("h:{}".format(parent)) for parent in p]

        child_key = "c:{}".format(k)
        if rdb.exists(child_key):
            card = rdb.scard(child_key)
            if card <= 15:
                c = rdb.smembers(child_key)
            else:
                c = rdb.srandmember(child_key, number=10)
            h['hashlookup:children-total'] = card
            h['children'] = [rdb.hgetall("h:{}".format(child)) for child in c]

        return calculate_trust(hobject=h)
2022-01-15 13:56:47 +00:00
@api.route('/lookup/sha256/<string:sha256>')
@api.doc(description="Lookup SHA-256.")
class lookup(Resource):
    """Resource answering GET /lookup/sha256/<sha256>."""

    def get(self, sha256):
        """Return the record known for a SHA-256 digest.

        Responds with a 400 message when ``sha256`` is rejected by
        ``check_sha256`` and with a 404 message when neither an ``l:`` nor
        an ``h:`` key exists for it. Otherwise the stored record is
        returned, expanded with OS/product details, parents, children and a
        trust score. Statistics, pub/sub notifications and session tracking
        are updated along the way when those features are enabled.
        """
        k = check_sha256(value=sha256)
        if k is False:
            return {
                'message': 'SHA-256 value incorrect, expecting a SHA-256 value in hex format'
            }, 400
        ttl = get_session() if session else False

        link_key = "l:{}".format(k)
        record_key = "h:{}".format(k)
        found = rdb.exists(link_key) or rdb.exists(record_key)

        # Hits and misses share the same bookkeeping; only the label differs.
        outcome = 'exist' if found else 'nx'
        if stats:
            rdb.zincrby("s:{}:sha256".format(outcome), score, k)
        if stats_pubsub:
            pub_lookup(channel=outcome, k=k)
        if session and ttl is not False:
            session_key = "session:{}:{}".format(
                request.headers.get('hashlookup_session'), outcome
            )
            rdb.sadd(session_key, k)
            # Keep the per-session set alive exactly as long as the session.
            rdb.expire(session_key, ttl)
        if not found:
            return {'message': 'Non existing SHA-256', 'query': sha256}, 404

        # The record is stored either directly under the queried value or
        # under the value that the "l:" key points to.
        if rdb.exists(record_key) and not rdb.exists(link_key):
            sha1 = k
            h = rdb.hgetall(record_key)
        else:
            sha1 = rdb.get(link_key)
            h = rdb.hgetall("h:{}".format(sha1))

        # Replace the OS/product codes by their detailed records, if any.
        for field in ("OpSystemCode", "ProductCode"):
            if field in h:
                detail_key = "h-{}:{}".format(field, h[field])
                if rdb.exists(detail_key):
                    h[field] = rdb.hgetall(detail_key)

        # Relations: all members for small sets, a random sample of 10 for
        # sets with more than 15 members.
        parent_key = "p:{}".format(sha1)
        if rdb.exists(parent_key):
            card = rdb.scard(parent_key)
            if card <= 15:
                p = rdb.smembers(parent_key)
            else:
                p = rdb.srandmember(parent_key, number=10)
            h['hashlookup:parent-total'] = card
            h['parents'] = [rdb.hgetall("h:{}".format(parent)) for parent in p]

        child_key = "c:{}".format(sha1)
        if rdb.exists(child_key):
            card = rdb.scard(child_key)
            if card <= 15:
                c = rdb.smembers(child_key)
            else:
                c = rdb.srandmember(child_key, number=10)
            h['hashlookup:children-total'] = card
            h['children'] = [rdb.hgetall("h:{}".format(child)) for child in c]

        return calculate_trust(hobject=h)
new: [server] /children and /parents end-points added The two new endpoints `children` and `parents` allow to paginate over the large-set of parents or children. - The first value is the SHA1 value having children or parents. - The second value is the number of elements to get (by default is 100 if the value is set to 0). - The third value is the cursor to paginate over the element (for starting the cursor must be set to 0). A sample usage: ~~~~ adulau@kolmogorov ~ $ curl -s http://127.0.0.1:5000/children/31C43D24d696BC5F5309CCBFA5BDEF65A7170439/10/0 | jq . { "children": [ "003587440172055C75130EF1A063C3BB050C3251", "007C1E16B3F0F2E48C114E458308397953C7D224", "014D1060C674FBBCEAFFD94B85D60AD00618B56B", "01A2FACD61D157FC80DD0C5F6B525CC9EDE4B6DE", "01D1A98F559966A05923A74EE239C6BBEEB0FDAC", "01D381F2FCDD1BDF642AF83C9E96083F2C8D1C03", "02B37BA21D1831C120C1C9C1D41893B4DB424EE7", "02DED521ADCF17AA8818EA1142F63E05F558E668", "0364E0EFE65D9B6502084813189B4D888C117859", "05C9A276A0E03F7A5F99DE5CC8911583FD8FD60E" ], "cursor": "05C9A276A0E03F7A5F99DE5CC8911583FD8FD60E", "total": 774 } adulau@kolmogorov ~ $ curl -s http://127.0.0.1:5000/children/31C43D24d696BC5F5309CCBFA5BDEF65A7170439/10/05C9A276A0E03F7A5F99DE5CC8911583FD8FD60E | jq . { "children": [ "063EC5526DA21372D77AFC3C40F694478521829B", "0647EA948ED37383F74CC68A94E2DC3CBC2A9E4E", "0648AAAC06A76A58CB1E999882447BBDEEA42C57", "06A62F10F269824FFD75A917A35ACD3F2461981C", "0727FE9E2437B15B3F879C7617973AE11E55BA13", "074A0CA7131AE8FD9665CFE68A0C124EB6AD0170", "075B11AE383071BDA9BE66E336C916F6E6E1F49C", "081A336DE7D636F95F0150B7708C614592CBBDAE", "08DF546EE44D4B7546FCE5A7B7E284CA35F1B059", "0947CE713B69C2318CA684BBB63912621CC17A6A" ], "cursor": "0947CE713B69C2318CA684BBB63912621CC17A6A", "total": 774 } ~~~~
2022-05-21 15:43:24 +00:00
@api.route('/parents/<string:sha1>/<int:count>/<string:cursor>')
@api.doc(
    description="Return parents from a given SHA1. A number of element to return and an offset must be given. If not set it will be the 100 first elements. A cursor must be given to paginate over. The starting cursor is 0."
)
class parents(Resource):
    """Resource paginating over the parents recorded for a SHA-1."""

    def get(self, sha1, count, cursor):
        """Return one page of the ``p:<sha1>`` set.

        :param sha1: digest whose parents are requested.
        :param count: page size passed to the set scan; 0 selects 100.
        :param cursor: scan cursor from the previous page; an empty value
            selects 0.
        :return: dict with ``parents`` (the page), ``cursor`` (value to use
            for the next page) and ``total`` (set cardinality); a 400
            message for a malformed digest; a 404 message when the set does
            not exist.
        """
        sha1 = check_sha1(value=sha1)
        if sha1 is False:
            return {
                'message': 'SHA1 value incorrect, expecting a SHA1 value in hex format.'
            }, 400
        count = count or 100
        cursor = cursor or 0
        set_key = "p:{}".format(sha1)
        if not rdb.exists(set_key):
            return {'message': 'The SHA1 value has no known parent.'}, 404
        next_cursor, members = rdb.sscan(set_key, count=count, cursor=cursor)
        return {
            'parents': members,
            'cursor': next_cursor,
            'total': rdb.scard(set_key),
        }
@api.route('/children/<string:sha1>/<int:count>/<string:cursor>')
@api.doc(
    description="Return children from a given SHA1. A number of element to return and an offset must be given. If not set it will be the 100 first elements. A cursor must be given to paginate over. The starting cursor is 0."
)
class children(Resource):
    """Resource paginating over the children recorded for a SHA-1."""

    def get(self, sha1, count, cursor):
        """Return one page of the ``c:<sha1>`` set.

        :param sha1: digest whose children are requested.
        :param count: page size passed to the set scan; 0 selects 100.
        :param cursor: scan cursor from the previous page; an empty value
            selects 0.
        :return: dict with ``children`` (the page), ``cursor`` (value to use
            for the next page) and ``total`` (set cardinality); a 400
            message for a malformed digest; a 404 message when the set does
            not exist.
        """
        sha1 = check_sha1(value=sha1)
        if sha1 is False:
            return {
                'message': 'SHA1 value incorrect, expecting a SHA1 value in hex format.'
            }, 400
        count = count or 100
        cursor = cursor or 0
        set_key = "c:{}".format(sha1)
        if not rdb.exists(set_key):
            return {'message': 'The SHA1 value has no known child.'}, 404
        next_cursor, members = rdb.sscan(set_key, count=count, cursor=cursor)
        return {
            'children': members,
            'cursor': next_cursor,
            'total': rdb.scard(set_key),
        }
@api.route('/info')
@api.doc(description="Info about the hashlookup database")
class info(Resource):
    """Resource exposing version and size information about the database."""

    def get(self):
        """Return dataset statistics stored in the backend plus versions.

        The total key count is taken from the ``estimate_keys[default]``
        field of the backend's INFO reply; the remaining values are read
        from individual keys.
        """
        backend_info = rdb.info()
        return {
            'nsrl-version': rdb.get('nsrl-version'),
            'stat:hashlookup_total_keys': backend_info['estimate_keys[default]'],
            'stat:nsrl_modern_rds': rdb.get('stat:nsrl_modern_rds'),
            'stat:nsrl_legacy': rdb.get('stat:nsrl_legacy'),
            'stat:nsrl_ios': rdb.get('stat:nsrl_ios'),
            'stat:nsrl_android': rdb.get('stat:nsrl_android'),
            'hashlookup-version': version,
        }
2022-01-15 13:56:47 +00:00
@api.route('/bulk/md5')
@api.doc(
    description="Bulk search of MD5 hashes in a JSON array with the key \'hashes\'."
)
class bulkmd5(Resource):
    """Resource resolving many MD5 digests in a single POST request."""

    def post(self):
        """Return the records known for the MD5 digests in the request body.

        The body must be a JSON object whose ``hashes`` key holds an array.
        Entries that are not strings or not valid MD5 digests are skipped,
        as are digests without an ``l:`` entry; unknown digests are only
        announced on the ``nx`` pub/sub channel when pub/sub is enabled.

        :return: list of records for the known digests, or a message with
            status 404 when the body does not have the expected shape (the
            status code is kept as-is for existing clients).
        """
        json_data = request.get_json(force=True)
        # Guard against bodies that are valid JSON but not an object with an
        # array under 'hashes'; these previously surfaced as a server error.
        if not isinstance(json_data, dict) or not isinstance(
            json_data.get('hashes'), list
        ):
            return {
                'message': 'JSON format incorrect. An array of hashes in the key \'hashes\' is expected.'
            }, 404
        ret = []
        for val in json_data['hashes']:
            if not isinstance(val, str):
                # Numbers, nulls or nested values cannot be digests.
                continue
            k = check_md5(value=val)
            if k is False:
                continue
            # One GET both tests for the mapping and fetches the SHA-1.
            sha1 = rdb.get("l:{}".format(k))
            if sha1 is None:
                if stats_pubsub:
                    pub_lookup(channel='nx', k=k)
                continue
            ret.append(rdb.hgetall("h:{}".format(sha1)))
            if stats:
                # MD5 hits are counted in the MD5 statistics, as the single
                # MD5 lookup does, instead of polluting the SHA-1 ranking.
                rdb.zincrby("s:exist:md5", score, k)
            if stats_pubsub:
                pub_lookup(channel='exist', k=k)
        return ret
2022-01-15 13:56:47 +00:00
@api.route('/bulk/sha1')
@api.doc(description="Bulk search of SHA1 hashes in a JSON array with the \'hashes\'.")
class bulksha1(Resource):
    """Resource resolving many SHA-1 digests in a single POST request."""

    def post(self):
        """Return the records known for the SHA-1 digests in the request body.

        The body must be a JSON object whose ``hashes`` key holds an array.
        Entries that are not strings or not valid SHA-1 digests are skipped,
        as are digests without an ``h:`` record; unknown digests are only
        announced on the ``nx`` pub/sub channel when pub/sub is enabled.

        :return: list of records for the known digests, or a message with
            status 404 when the body does not have the expected shape (the
            status code is kept as-is for existing clients).
        """
        json_data = request.get_json(force=True)
        # Guard against bodies that are valid JSON but not an object with an
        # array under 'hashes'; these previously surfaced as a server error.
        if not isinstance(json_data, dict) or not isinstance(
            json_data.get('hashes'), list
        ):
            return {
                'message': 'JSON format incorrect. An array of hashes in the key \'hashes\' is expected.'
            }, 404
        ret = []
        for val in json_data['hashes']:
            if not isinstance(val, str):
                # Numbers, nulls or nested values cannot be digests.
                continue
            k = check_sha1(value=val)
            if k is False:
                continue
            record_key = "h:{}".format(k)
            if not rdb.exists(record_key):
                if stats_pubsub:
                    pub_lookup(channel='nx', k=k)
                continue
            ret.append(rdb.hgetall(record_key))
            if stats:
                rdb.zincrby("s:exist:sha1", score, k)
            if stats_pubsub:
                pub_lookup(channel='exist', k=k)
        return ret
2022-01-15 13:56:47 +00:00
@api.route('/session/create/<string:name>')
@api.doc(
    description="Create a session key to keep search context. The session is attached to a name. After the session is created, the header `hashlookup_session` can be set to the session name."
)
class sessioncreate(Resource):
    """Resource creating a named lookup session."""

    def get(self, name):
        """Create (or overwrite) the session ``name``.

        The ``session:<name>`` key stores the textual form of the client
        description and is given the configured session lifetime.

        :return: a confirmation message; a 400 message when the name is
            missing or longer than 120 characters; a 500 message when the
            session feature is disabled.
        """
        if name is None or len(name) > 120:
            return {'message': 'Expecting a name for the session'}, 400
        if session is False:
            return {'message': 'Session feature is not enabled'}, 500
        session_key = 'session:{}'.format(name)
        rdb.set(session_key, str(client_info()))
        rdb.expire(session_key, session_ttl)
        confirmation = 'Session {} created and session will expire in {} seconds'.format(
            name, session_ttl
        )
        return {'message': confirmation}
@api.route('/session/get/<string:name>')
@api.doc(description="Return set of matching and non-matching hashes from a session.")
class sessioncreate(Resource):
    """Resource returning what was looked up during a named session."""

    def get(self, name):
        """Return the hashes recorded for the session ``name``.

        :return: dict with ``nx`` (values not found), ``exist`` (values
            found) and ``info`` (the value stored under ``session:<name>``);
            a 400 message when the name is missing or longer than 120
            characters; a 500 message when the session feature is disabled;
            a 404 message when the session does not exist.
        """
        if name is None or len(name) > 120:
            return {'message': 'Expecting a name for the session'}, 400
        if session is False:
            return {'message': 'Session feature is not enabled'}, 500
        session_key = 'session:{}'.format(name)
        if not rdb.exists(session_key):
            return {'message': 'Non-existing session'}, 404
        missing = rdb.smembers('session:{}:nx'.format(name))
        matching = rdb.smembers('session:{}:exist'.format(name))
        return {
            'nx': list(missing),
            'exist': list(matching),
            'info': rdb.get(session_key),
        }
2022-01-15 13:56:47 +00:00
@api.route('/stats/top')
@api.doc(description="Return the top 100 of most queried values.")
class stattop(Resource):
    """Resource exposing the most queried SHA-1 values."""

    def get(self):
        """Return the most queried unknown and known SHA-1 values.

        :return: dict with ``nx`` (the top unknown values with their scores,
            minus those that have a record by now) and ``exist`` (one entry
            per top known value with its ``FileName`` and, under ``SHA-1``,
            the ranking item as returned by the backend); a 400 message when
            public statistics are disabled.
        """
        if stats_public is False:
            return {'message': 'Public statistics not enabled'}, 400
        ret = {}
        # Sorted-set ranges are inclusive on both ends: 0..99 is 100 items.
        top_nx = rdb.zrevrange("s:nx:sha1", 0, 99, withscores=True)
        # Build a filtered list instead of removing from the list being
        # iterated, which skipped the element following each removal.
        ret['nx'] = [
            val for val in top_nx if not rdb.exists("h:{}".format(val[0]))
        ]
        ret['exist'] = []
        for value in rdb.zrevrange("s:exist:sha1", 0, 99, withscores=True):
            entry = {}
            entry['FileName'] = rdb.hget("h:{}".format(value[0]), "FileName")
            # NOTE(review): with withscores=True each item is a
            # (value, score) pair, so 'SHA-1' carries both; kept unchanged
            # for existing consumers.
            entry['SHA-1'] = value
            ret['exist'].append(entry)
        return ret
2022-01-15 13:56:47 +00:00
if __name__ == "__main__":
    # Executed as a script: start the Flask server on all interfaces.
    app.run(host="0.0.0.0")