#!/usr/bin/env python2
import os
import re
import socket
import subprocess
import sys
import tempfile
import urllib2
from StringIO import StringIO

import dpkt
from pcap import pcap

# Command line handling
if len(sys.argv) < 2:
	print "http.store.py - Use PCAP to store all HTTP downloads to disc."
	print "Copyright (c) 2009, Phillip Berndt"
	print
	print "Syntax: http.store.py [-n] <interface|dumpFile> [filter-regex]"
	print
	print "Note: I will store all data in memory, even if it will be filtered"
	print " Don't download large files while running this program."
	print
	sys.exit(0)

dumpFile = sys.argv[1]
if not os.access(dumpFile, os.F_OK):
	print "File %s not found. Assuming it's a device's name" % dumpFile
	print "Starting dumpcap (from wireshark). Press Ctrl-c to end capturing."
	print "You should wait ~15 seconds after your last download before ending it!"
	print
	device = dumpFile
	dumpFile = tempfile.mktemp()
	dumpcap = os.popen("dumpcap -i %s -f 'tcp port 80' -w %r" % (device, dumpFile))
	try:
		dumpcap.read()
	except:
		dumpcap.close()

if len(sys.argv) > 2:
	FILTER = sys.argv[2]
else:
	FILTER = False

# Setup fake HTTP class for conversion, so we can use
# urllib to parse the responses
# File-like object the fake socket hands out / reads from. process() points
# it at the captured server response before calling urllib2.urlopen() and
# sets it back to False afterwards.
fakeSocketReaderFile = False
class fakeSocket():
	"""Stand-in for a socket that never touches the network.

	Most of the socket API is replaced by a callable that ignores its
	arguments and returns None. Reading is redirected to the module level
	fakeSocketReaderFile object.
	"""

	def _noop(self, *ignored):
		# Accept any positional arguments and do nothing
		return None

	def __init__(self, *args):
		stubbed = (
			'accept', 'bind', 'close', 'connect', 'connect_ex', 'dup',
			'family', 'fileno', 'getpeername', 'getsockname', 'getsockopt',
			'gettimeout', 'listen', 'proto', 'recv_into', 'recvfrom',
			'recvfrom_into', 'send', 'sendall', 'sendto', 'setblocking',
			'setsockopt', 'settimeout', 'shutdown',
		)
		for name in stubbed:
			setattr(self, name, self._noop)

	def makefile(self, *args):
		# Whatever mode/buffering is asked for, hand out the prepared reader
		return fakeSocketReaderFile

	def recv(self, toRead):
		# Serve up to toRead bytes from the prepared reader
		chunk = fakeSocketReaderFile.read(toRead)
		return chunk
# Replace the socket class on the socket module that urllib2 references with
# the fake one. NOTE(review): presumably this is what lets process() call
# urllib2.urlopen() without opening a real connection - confirm against
# httplib's connection setup.
urllib2.socket.socket = fakeSocket

# Process a connection
def process(conn):
	global fakeSocketReaderFile
	fromData = reduce(lambda x,y: x + y[1], sorted(conn["from"].items()), "")
	toData   = reduce(lambda x,y: x + y[1], sorted(conn["to"].items()), "")

	file = re.search("(?:GET|POST) (/[^ ]*) HTTP", toData)
	host = re.search("Host: (.+)", toData)
	if not (file and host):
		print "Received something weird: "
		print toData
		print "---"
		return
	uri = "http://" + host.group(1).strip() + re.sub("%([0-9A-F]{2})", lambda x: chr(int(x.group(1), 16)), file.group(1)).strip()

	print "Processing URI ", uri, ": ",
	sys.stdout.flush()
	if not re.search("HTTP.+ 200", fromData):
		print "Replied non 200. Ignoring"
		return
	if FILTER:
		if not re.search(FILTER, uri):
			print "filtered"
			return
	
	fileName = os.path.basename(uri)
	if not fileName:
		fileName = "index"
	fileName = fileName[:250]
	if os.access(fileName, os.F_OK):
		add = 0
		while(os.access("%s~%d" % (fileName, add), os.F_OK)): add += 1
		fileName = "%s~%d" % (fileName, add)
	fakeSocketReaderFile = StringIO(fromData)
	open(fileName, "w").write(urllib2.urlopen("http://127.0.0.1/" + fileName).read())
	fileIsA = os.popen("file %r" % fileName).read()
	if "gzip" in fileIsA:
		os.rename(fileName, fileName + ".gz")
		os.system("gzip -d %r" % fileName)
	if "bzip" in fileIsA:
		os.rename(fileName, fileName + ".bz")
		os.system("bzip2 -d %r" % fileName)
	fakeSocketReaderFile = False

	print "ok -> ", fileName

# Open data stream
print "Opening dumpFile %s" % dumpFile
listener = pcap(dumpFile)
listener.setfilter("tcp port 80")
os.unlink(dumpFile)
connections = {}

# Offset into the raw packet at which the IP header is looked for. It is kept
# across packets.
shift = 0
for ts, pkt in listener:
	# Extract TCP packet
	# Heuristic to avoid having to implement all link layer formats: try to
	# parse an IP packet at increasing offsets until one carries TCP data
	# from or to port 80.
	firstTry = True
	tcp = None
	while True:
		try:
			ip = dpkt.ip.IP(pkt[shift:])
			# 6 is the IP protocol number of TCP
			assert(ip.p == 6)
			tcp = ip.data
			# The result is unused; the attribute access raises if the payload
			# has no dport, which sends us to the except branch.
			tcp.dport == 1
		except:
			# Anything going wrong means: no TCP packet at this offset
			tcp = None
		if firstTry:
			# NOTE(review): the first probe uses the offset left over from the
			# previous packet, but its result is dropped here: the offset is
			# reset and the search restarts at 0 whether or not it matched.
			# Looks like the offset was meant to be reused - confirm.
			shift = 0
			firstTry = False
			continue
		if tcp and (tcp.dport == 80 or tcp.sport == 80): break
		# Give up once the offset has run past the end of the packet
		if shift > len(pkt): break
		shift += 1
	# Skip packets in which no port 80 TCP segment was found
	if not tcp or tcp.dport != 80 and tcp.sport != 80:
		continue

	# Generate a unique identifier
	# This is the port that is not 80, i.e. the client side port.
	# NOTE(review): addresses are not part of the key, so two clients using
	# the same port number would share one record - confirm this is acceptable.
	identifier = (tcp.sport if tcp.sport != 80 else tcp.dport)

	# Store data in connections
	if tcp.data:
		if identifier not in connections:
			# "from"/"to": payload by sequence number, received from / sent to
			# port 80. "state": close tracking bits, see below. "fromlen":
			# number of payload bytes received from port 80.
			connections[identifier] = { "from": {}, "to": {}, "state": 0, "fromlen": 0 }
		#if tcp.seq in connections[identifier][("from" if tcp.sport == 80 else "to")]:
		#	print "WARNING: Double content"
		#connections[identifier][("from" if tcp.sport == 80 else "to")][tcp.seq] = tcp.data
		if tcp.sport == 80:
			connections[identifier]["fromlen"] += len(tcp.data)
		# Payload arriving with an already known sequence number is appended
		# to the stored payload, not replaced.
		if tcp.seq not in connections[identifier][("from" if tcp.sport == 80 else "to")]:
			connections[identifier][("from" if tcp.sport == 80 else "to")][tcp.seq] = ""
		connections[identifier][("from" if tcp.sport == 80 else "to")][tcp.seq] += tcp.data

	# Packets of connections that never carried payload are of no interest
	if identifier not in connections:
		continue
	
	# If the connection is closed, process it
	# State bits: 1 = FIN/RST seen from port 80, 2 = FIN/RST seen from the
	# client, 4 = port 80 sent an ACK after the client's FIN/RST,
	# 8 = the client sent an ACK after port 80's FIN/RST.
	# The ACK test runs before the FIN/RST bit of the same packet is recorded.
	if tcp.flags & dpkt.tcp.TH_ACK and connections[identifier]["state"] & (2 if tcp.sport == 80 else 1):
		connections[identifier]["state"] |= (1 if tcp.sport == 80 else 2) << 2
	if tcp.flags & (dpkt.tcp.TH_FIN | dpkt.tcp.TH_RST):
		connections[identifier]["state"] |= 1 if tcp.sport == 80 else 2
	# 15 = all four bits set: both directions closed and acknowledged
	if connections[identifier]["state"] == 15:
		print "Received %d bytes" % connections[identifier]["fromlen"]
		process(connections[identifier])
		del connections[identifier]
