#!/bin/ from pcap import pcap import dpkt import sys import socket import urllib2 import os import tempfile from StringIO import StringIO import re # Command line handling if len(sys.argv) < 2: print "http.store.py - Use PCAP to store all HTTP downloads to disc." print "Copyright (c) 2009, Phillip Berndt" print print "Syntax: http.store.py [-n] [filter-regex]" print print "Note: I will store all data in memory, even if it will be filtered" print " Don't download large files while running this program." print sys.exit(0) dumpFile = sys.argv[1] if not os.access(dumpFile, os.F_OK): print "File %s not found. Assuming it's a device's name" % dumpFile print "Starting dumpcap (from wireshark). Press Ctrl-c to end capturing." print "You should wait ~15 seconds after your last download before ending it!" print device = dumpFile dumpFile = tempfile.mktemp() dumpcap = os.popen("dumpcap -i %s -f 'tcp port 80' -w %r" % (device, dumpFile)) try: dumpcap.read() except: dumpcap.close() if len(sys.argv) > 2: FILTER = sys.argv[2] else: FILTER = False # Setup fake HTTP class for conversion, so we can use # urllib to parse the responses fakeSocketReaderFile = False class fakeSocket(): def _foo(*x): pass def __init__(self, *x): for fn in [ 'accept', 'bind', 'close', 'connect', 'connect_ex', 'dup', 'family', 'fileno', 'getpeername', 'getsockname', 'getsockopt', 'gettimeout', 'listen', 'proto', 'recv_into', 'recvfrom', 'recvfrom_into', 'send', 'sendall', 'sendto', 'setblocking', 'setsockopt', 'settimeout', 'shutdown' ]: setattr(self, fn, self._foo) def makefile(*x): return fakeSocketReaderFile def recv(self, toRead): return fakeSocketReaderFile.read(toRead) urllib2.socket.socket = fakeSocket # Process a connection def process(conn): global fakeSocketReaderFile fromData = reduce(lambda x,y: x + y[1], sorted(conn["from"].items()), "") toData = reduce(lambda x,y: x + y[1], sorted(conn["to"].items()), "") file = re.search("(?:GET|POST) (/[^ ]*) HTTP", toData) host = re.search("Host: (.+)", toData) if not (file and host): print "Received something weird: " print toData print "---" return uri = "http://" + host.group(1).strip() + re.sub("%([0-9A-F]{2})", lambda x: chr(int(x.group(1), 16)), file.group(1)).strip() print "Processing URI ", uri, ": ", sys.stdout.flush() if not re.search("HTTP.+ 200", fromData): print "Replied non 200. Ignoring" return if FILTER: if not re.search(FILTER, uri): print "filtered" return fileName = os.path.basename(uri) if not fileName: fileName = "index" fileName = fileName[:250] if os.access(fileName, os.F_OK): add = 0 while(os.access("%s~%d" % (fileName, add), os.F_OK)): add += 1 fileName = "%s~%d" % (fileName, add) fakeSocketReaderFile = StringIO(fromData) open(fileName, "w").write(urllib2.urlopen("http://127.0.0.1/" + fileName).read()) fileIsA = os.popen("file %r" % fileName).read() if "gzip" in fileIsA: os.rename(fileName, fileName + ".gz") os.system("gzip -d %r" % fileName) if "bzip" in fileIsA: os.rename(fileName, fileName + ".bz") os.system("bzip2 -d %r" % fileName) fakeSocketReaderFile = False print "ok -> ", fileName # Open data stream print "Opening dumpFile %s" % dumpFile listener = pcap(dumpFile) listener.setfilter("tcp port 80") os.unlink(dumpFile) connections = {} shift = 0 for ts, pkt in listener: # Extract TCP packet # Heuristic to avoid having to implement all transport layer formats firstTry = True tcp = None while True: try: ip = dpkt.ip.IP(pkt[shift:]) assert(ip.p == 6) tcp = ip.data tcp.dport == 1 except: tcp = None if firstTry: shift = 0 firstTry = False continue if tcp and (tcp.dport == 80 or tcp.sport == 80): break if shift > len(pkt): break shift += 1 if not tcp or tcp.dport != 80 and tcp.sport != 80: continue # Generate a unique identifier identifier = (tcp.sport if tcp.sport != 80 else tcp.dport) # Store data in connections if tcp.data: if identifier not in connections: connections[identifier] = { "from": {}, "to": {}, "state": 0, "fromlen": 0 } #if tcp.seq in connections[identifier][("from" if tcp.sport == 80 else "to")]: # print "WARNING: Double content" #connections[identifier][("from" if tcp.sport == 80 else "to")][tcp.seq] = tcp.data if tcp.sport == 80: connections[identifier]["fromlen"] += len(tcp.data) if tcp.seq not in connections[identifier][("from" if tcp.sport == 80 else "to")]: connections[identifier][("from" if tcp.sport == 80 else "to")][tcp.seq] = "" connections[identifier][("from" if tcp.sport == 80 else "to")][tcp.seq] += tcp.data if identifier not in connections: continue # If the connection is closed, process it if tcp.flags & dpkt.tcp.TH_ACK and connections[identifier]["state"] & (2 if tcp.sport == 80 else 1): connections[identifier]["state"] |= (1 if tcp.sport == 80 else 2) << 2 if tcp.flags & (dpkt.tcp.TH_FIN | dpkt.tcp.TH_RST): connections[identifier]["state"] |= 1 if tcp.sport == 80 else 2 if connections[identifier]["state"] == 15: print "Received %d bytes" % connections[identifier]["fromlen"] process(connections[identifier]) del connections[identifier]