Eine Spielerei aus der Nice-to-have-Ecke: Speichert ALLE per HTTP auf Port 80 übertragenen Dateien in Echtzeit, zum Beispiel FLV-Filme und MP3-Dateien.
#!/usr/bin/env python2
from pcap import pcap
import dpkt
import sys
import socket
import urllib2
import os
import tempfile
from StringIO import StringIO
import re
# Command line handling
#
# argv[1] is either an existing capture file or the name of a network
# interface; argv[2] is an optional regular expression that URIs must match.
# NOTE(review): the usage text advertises a "-n" switch, but nothing in the
# visible code handles it - confirm whether it should be implemented or removed.
if len(sys.argv) < 2:
    print("http.store.py - Use PCAP to store all HTTP downloads to disc.")
    print("Copyright (c) 2009, Phillip Berndt")
    print("")
    print("Syntax: http.store.py [-n] <interface|dumpFile> [filter-regex]")
    print("")
    print("Note: I will store all data in memory, even if it will be filtered")
    print(" Don't download large files while running this program.")
    print("")
    sys.exit(0)

dumpFile = sys.argv[1]
if not os.access(dumpFile, os.F_OK):
    print("File %s not found. Assuming it's a device's name" % dumpFile)
    print("Starting dumpcap (from wireshark). Press Ctrl-c to end capturing.")
    print("You should wait ~15 seconds after your last download before ending it!")
    print("")
    device = dumpFile

    # mkstemp creates the file atomically; mktemp only returns a name that
    # another process could claim before dumpcap opens it.
    tempHandle, dumpFile = tempfile.mkstemp(suffix=".pcap")
    os.close(tempHandle)

    # An argument list keeps the device name and the path away from the shell.
    import subprocess
    try:
        capture = subprocess.Popen(
            ["dumpcap", "-i", device, "-f", "tcp port 80", "-w", dumpFile])
    except OSError:
        # dumpcap could not be started; do not leave the empty file behind.
        os.unlink(dumpFile)
        raise
    try:
        capture.wait()
    except KeyboardInterrupt:
        # Ctrl-c is delivered to dumpcap as well. Wait until it has exited
        # so that the capture file is complete before it is read.
        capture.wait()

# Regular expression a URI has to match to be stored; False stores everything.
if len(sys.argv) > 2:
    FILTER = sys.argv[2]
else:
    FILTER = False
# Stand-in for the real socket class: urllib2 is pointed at it so that its
# HTTP response parser reads previously captured bytes instead of the network.
# fakeSocketReaderFile holds the file-like object currently being replayed
# (False while nothing is being replayed).
fakeSocketReaderFile = False


class fakeSocket():
    """Socket look-alike whose reads are served from fakeSocketReaderFile.

    Every other socket operation is replaced by a do-nothing callable, so
    connecting and sending succeed silently.
    """

    def _foo(self, *ignored):
        """Accept any positional arguments and do nothing."""
        pass

    def __init__(self, *ignored):
        stubbed = (
            "accept bind close connect connect_ex dup family fileno "
            "getpeername getsockname getsockopt gettimeout listen proto "
            "recv_into recvfrom recvfrom_into send sendall sendto "
            "setblocking setsockopt settimeout shutdown"
        ).split()
        doNothing = self._foo
        for attribute in stubbed:
            setattr(self, attribute, doNothing)

    def makefile(self, *ignored):
        """Return the object currently being replayed, whatever the mode."""
        return fakeSocketReaderFile

    def recv(self, toRead):
        """Return up to toRead bytes of the replayed data."""
        return fakeSocketReaderFile.read(toRead)


# From here on every socket created through the socket module is a fakeSocket.
urllib2.socket.socket = fakeSocket
# Process a connection
def process(conn):
    """Store the body of one captured HTTP exchange in the current directory.

    conn is a connection record: a dict whose "to" and "from" entries map
    TCP sequence numbers to the payload sent to and received from the
    server. The payloads are joined in sequence-number order, the request
    line and Host header give the URI, and the response is run through
    urllib2 (reading from the fake socket) so that headers and transfer
    encoding are stripped. Responses that do not start with a 200 status
    line, or whose URI does not match FILTER, are skipped. gzip and bzip2
    bodies are decompressed with the external tools.

    Progress is reported on stdout; the function always returns None.
    """
    global fakeSocketReaderFile
    # Local import: argument lists keep file names taken from network
    # traffic away from the shell.
    import subprocess

    # Sorting the (sequence number, payload) pairs restores stream order.
    fromData = "".join([chunk for _, chunk in sorted(conn["from"].items())])
    toData = "".join([chunk for _, chunk in sorted(conn["to"].items())])

    request = re.search("(?:GET|POST) (/[^ ]*) HTTP", toData)
    host = re.search("Host: (.+)", toData)
    if not (request and host):
        print("Received something weird: ")
        print(toData)
        print("---")
        return

    # Undo percent-encoding; hex digits may be upper or lower case.
    path = re.sub("%([0-9A-Fa-f]{2})",
                  lambda match: chr(int(match.group(1), 16)),
                  request.group(1))
    uri = "http://" + host.group(1).strip() + path.strip()
    sys.stdout.write("Processing URI  %s :  " % uri)
    sys.stdout.flush()

    # Only the status line counts; "200" further down in the body does not.
    if not re.match(r"HTTP/\S+ 200\b", fromData):
        print("Replied non 200. Ignoring")
        return
    if FILTER and not re.search(FILTER, uri):
        print("filtered")
        return

    # Derive a file name that does not clash with an existing file.
    fileName = os.path.basename(uri).replace("\0", "")
    if not fileName:
        fileName = "index"
    fileName = fileName[:250]
    if os.access(fileName, os.F_OK):
        add = 0
        while os.access("%s~%d" % (fileName, add), os.F_OK):
            add += 1
        fileName = "%s~%d" % (fileName, add)

    # Let urllib2 parse the captured response. The URL is a dummy because
    # the fake socket never sends anything.
    fakeSocketReaderFile = StringIO(fromData)
    try:
        body = urllib2.urlopen("http://127.0.0.1/").read()
    except Exception as error:
        # One malformed capture must not abort the remaining connections.
        print("failed to parse response: %s" % error)
        return
    finally:
        fakeSocketReaderFile = False

    with open(fileName, "wb") as output:
        output.write(body)

    # "-b" omits the file name from the output, so a name that merely
    # contains "gzip" or "bzip" is not mistaken for compressed data.
    try:
        fileIsA = subprocess.Popen(["file", "-b", "--", fileName],
                                   stdout=subprocess.PIPE).communicate()[0]
    except OSError:
        fileIsA = ""

    for marker, suffix, tool in (("gzip", ".gz", "gzip"),
                                 ("bzip", ".bz", "bzip2")):
        if marker not in fileIsA:
            continue
        packed = fileName + suffix
        os.rename(fileName, packed)
        try:
            subprocess.call([tool, "-d", "--", packed])
        except OSError:
            pass  # decompressor not installed; the packed file is kept
        if not os.access(fileName, os.F_OK):
            # Decompression did not produce the file; report what exists.
            fileName = packed
        break

    print("ok ->  %s" % fileName)
# Open data stream
print("Opening dumpFile %s" % dumpFile)
listener = pcap(dumpFile)
listener.setfilter("tcp port 80")
# Remove the capture only when it is the temporary file written by dumpcap;
# a dump file named on the command line belongs to the user.
if dumpFile != sys.argv[1]:
    os.unlink(dumpFile)

# Connection records keyed by (client address, client port, server address).
# "from"/"to" map sequence numbers to payload received from / sent to the
# server, "fromlen" counts the bytes received, "state" is a bit field:
#   1 = server sent FIN/RST      2 = client sent FIN/RST
#   4 = server ACKed after the client's FIN/RST
#   8 = client ACKed after the server's FIN/RST
connections = {}
# Offset of the IP header inside a captured frame. It is kept between
# packets so that the scan below normally succeeds on the first attempt.
shift = 0
for ts, pkt in listener:
    # Extract TCP packet
    # Heuristic to avoid having to implement all transport layer formats:
    # try the offset that worked last time, otherwise restart at 0 and move
    # forward byte by byte until an IP/TCP packet on port 80 decodes.
    firstTry = True
    tcp = None
    while True:
        try:
            ip = dpkt.ip.IP(pkt[shift:])
            # Protocol number 6 is TCP.
            if ip.p != 6 or not isinstance(ip.data, dpkt.tcp.TCP):
                raise ValueError("no TCP segment at this offset")
            tcp = ip.data
        except Exception:
            tcp = None
            if firstTry:
                shift = 0
                firstTry = False
                continue
        if tcp is not None and (tcp.dport == 80 or tcp.sport == 80):
            break
        if shift > len(pkt):
            break
        shift += 1
    if tcp is None or (tcp.dport != 80 and tcp.sport != 80):
        continue

    # Generate a unique identifier. The client port alone is not unique
    # once several hosts show up in the capture, so include both addresses.
    fromServer = tcp.sport == 80
    if fromServer:
        identifier = (ip.dst, tcp.dport, ip.src)
    else:
        identifier = (ip.src, tcp.sport, ip.dst)
    direction = "from" if fromServer else "to"

    # Store data in connections
    if tcp.data:
        conn = connections.setdefault(
            identifier, {"from": {}, "to": {}, "state": 0, "fromlen": 0})
        segments = conn[direction]
        known = segments.get(tcp.seq, "")
        # A retransmission repeats a sequence number; keep the longest
        # payload seen instead of appending the duplicate.
        if len(tcp.data) > len(known):
            if fromServer:
                conn["fromlen"] += len(tcp.data) - len(known)
            segments[tcp.seq] = tcp.data

    conn = connections.get(identifier)
    if conn is None:
        continue

    # If the connection is closed, process it
    if tcp.flags & dpkt.tcp.TH_ACK and conn["state"] & (2 if fromServer else 1):
        conn["state"] |= (1 if fromServer else 2) << 2
    if tcp.flags & (dpkt.tcp.TH_FIN | dpkt.tcp.TH_RST):
        conn["state"] |= 1 if fromServer else 2
    if conn["state"] == 15:
        print("Received %d bytes" % conn["fromlen"])
        process(conn)
        del connections[identifier]