# -*- coding: utf-8 -*-
#
# dcu-fetch.py is a Python script to fetch blobs published by DCU
#
# Requires the Python (>=2.6) standard library plus the Azure Python SDK
#
# Copyright (C) 2013 Alexandre Dulaunoy - alexandre.dulaunoy@circl.lu
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
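# Example invocation (a sketch; the account name, key and date below are
# placeholders, not real values):
#
#   python dcu-fetch.py -a mydcuaccount -k <azure-account-key> -f json -t 2013-02-06
#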
import gzip
import StringIO
from azure.storage import BlobService
from optparse import OptionParser
import sys
import json
debug = False
header = True
output_format = "txt"
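# Column names of a DCU feed record, in the order they appear in each
# tab-separated entry; they are printed as the txt header line and used as
# JSON keys.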
headers = ["SourcedFrom","FileTimeUtc", "Botnet", "SourceIp", "SourcePort", "SourceIpAsnNr", "TargetIp", "TargetPort", "Payload", "SourceIpCountryCode", "SourceIpRegion", "SourceIpCity", "SourceIpPostalCode", "SourceIpLatitude", "SourceIpLongitude", "SourceIpMetroCode", "SourceIpAreaCode", "HttpRequest", "HttpReferrer", "HttpUserAgent", "HttpMethod", "HttpVersion", "HttpHost", "Custom Field 1", "Custom Field 2", "Custom Field 3", "Custom Field 4", "Custom Field 5"]
usage = "usage: %prog [options] - DCU feed blob fetcher"
parser = OptionParser(usage)
parser.add_option("-d","--debug", dest="debug", action='store_true', help="output debug message on stderr")
parser.add_option("-a","--account_name", dest="account_name", help="Microsoft Azure account name")
parser.add_option("-k","--account_key", dest="account_key", help="Microsoft Azure key to access DCU container")
parser.add_option("-c","--clear", dest="clear", action='store_true', help="Delete blobs and containers after fetching")
parser.add_option("-e","--header", dest="header", action='store_true', help="Remove field header in the output (default is displayed)")
parser.add_option("-f","--format", dest="output_format", help="output txt, json (default is txt)")
parser.add_option("-t","--date", dest="date", help="date in format YYYY-MM-DD to limit the query (default is all)")
(options, args) = parser.parse_args()
if options.debug:
    debug = True
if options.header:
    header = False
if options.output_format:
    output_format = options.output_format
if options.account_name:
    account_name = options.account_name
else:
    sys.stderr.write("Azure account name is missing\n")
    sys.exit(1)
if options.account_key:
    account_key = options.account_key
else:
    sys.stderr.write("Azure key is missing\n")
    sys.exit(1)
if header and not options.output_format:
    print '\t'.join(str(h) for h in headers)
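# Connect to the Azure blob service and walk every container and blob in the
# account; each blob holds gzip-compressed, tab-separated DCU records whose
# columns follow the headers list above.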
blob_service = BlobService(account_name, account_key)
for container in blob_service.list_containers():
    c = container.name
    if c == "heartbeat": continue
    if options.date and not (c == "processed-" + options.date): continue
    if debug: sys.stderr.write("Processing container: " + str(c) + "\n")
    for b in blob_service.list_blobs(c):
        if debug: sys.stderr.write("Processing blob: " + str(b.name) + "\n")
        data = blob_service.get_blob(c, b.name)
        cs = StringIO.StringIO(data)
        gzipstream = gzip.GzipFile(fileobj=cs)
        if output_format == "txt":
            print gzipstream.read()
        elif output_format == "json":
            d = {}
            i = 0
            ds = gzipstream.read()
            # some DCU entries contain more values than the 28 defined
            # headers, so cap the split at the known field names
            for x in ds.strip().split("\t")[:len(headers)]:
                d[headers[i]] = x
                i = i + 1
            print(json.dumps(d, sort_keys=True))
        if options.clear:
            if debug: sys.stderr.write("Deleting blob: " + str(b.name) + "\n")
            blob_service.delete_blob(c, b.name)
    if options.clear:
        if debug: sys.stderr.write("Deleting container: " + str(c) + "\n")
        blob_service.delete_container(c)