Ein simpler Spider, der eine Sitemap erstellen kann.
<?php
/*
Ein Crawler, der eine Sitemap generiert
Aktion: PHP Scripte für die Armen dieser Welt
Der Erlös geht für mein Pausenbrot drauf
Copyright (c) 2006 by Phillip 'Firebird' Berndt
*/
?>
<h1>Sitemap Generator</h1>
<form method="post">
<p>
URL:
<input type="text" name="url" />
<input type="submit" />
</p>
</form>
<hr/>
<ul>
<?php
// Send every echo to the browser immediately so the list grows while crawling.
ob_implicit_flush();

// Registry of crawled pages: url => array(sequence number, title).
// It has to exist BEFORE the crawl starts, because crawlURL() counts and
// fills it through $GLOBALS['urlList'].
$urlList = array();

// Only a non-empty string is a usable start URL; a form field submitted as
// "url[]" arrives as an array and must not reach the crawler.
if (isset($_POST['url']) && is_string($_POST['url']) && trim($_POST['url']) !== '') {
    crawlURL(trim($_POST['url']));
}
/**
 * Recursively crawls a page and prints it, together with every page on the
 * same host that is reachable from it, as nested HTML list items.
 *
 * Visited pages are recorded in $GLOBALS['urlList'] as
 * url => array(sequence number, title). The sequence number is also used for
 * the "a<N>" element id, so a page that is met a second time is printed as a
 * link to its first occurrence instead of being fetched again.
 *
 * All output is written with echo; nothing is returned. The crawl has no
 * depth or page limit.
 *
 * @param string $url Absolute http:// or https:// URL of the page to crawl.
 * @return void
 */
function crawlURL($url)
{
    // An array (e.g. from a "url[]" form field) cannot be used as a key below.
    if (!is_string($url)) {
        return;
    }

    // Create the registry on demand so the function also works when it is
    // called before the surrounding script has initialised $urlList.
    if (!isset($GLOBALS['urlList']) || !is_array($GLOBALS['urlList'])) {
        $GLOBALS['urlList'] = array();
    }

    if (isset($GLOBALS['urlList'][$url])) {
        $known = $GLOBALS['urlList'][$url];
        // The title was taken from a remote page, so it must be escaped.
        echo('<li><a href="#a'.$known[0].'">'.htmlspecialchars($known[1]).'</a></li>');
        return;
    }

    $dirs = parse_url($url);
    $scheme = (is_array($dirs) && isset($dirs['scheme'])) ? strtolower($dirs['scheme']) : '';
    if (($scheme != 'http' && $scheme != 'https') || empty($dirs['host'])) {
        // Refuse everything file_get_contents() would read from the local
        // machine or through another wrapper (plain paths, file://, php://).
        echo('<li><em>Skipped (not an http/https URL):</em> '.htmlspecialchars($url).'</li>');
        return;
    }

    $host = strtolower($dirs['host']);
    $basedir = $scheme.'://'.$dirs['host'].(isset($dirs['port']) ? ':'.$dirs['port'] : '');
    // parse_url() omits 'path' for URLs such as "http://host".
    $path = isset($dirs['path']) ? $dirs['path'] : '/';
    // Directory of the current page INCLUDING the trailing slash, so that a
    // relative link can simply be appended.
    $subdir = $basedir.substr($path, 0, strrpos($path, '/') + 1);

    $ctr = count($GLOBALS['urlList']);
    $data = file_get_contents($url);
    if ($data === false) {
        // Remember the failure as well, so the page is not requested again.
        $GLOBALS['urlList'][$url] = array($ctr, $url);
        echo('<li id="a'.$ctr.'"><em>Could not fetch:</em> <a href="'.htmlspecialchars($url).'">'.htmlspecialchars($url).'</a></li>');
        return;
    }

    $title = '';
    if (preg_match('/<title>(.+?)<\/title>/si', $data, $match)) {
        $title = trim(strip_tags($match[1]));
    }
    if ($title === '') {
        $title = basename($url);
    }

    // Register before descending so that circular links end the recursion.
    $GLOBALS['urlList'][$url] = array($ctr, $title);
    echo('<li id="a'.$ctr.'"><a href="'.htmlspecialchars($url).'">'.htmlspecialchars($title).'</a>');
    echo('<ul>');

    // href may be any attribute of the <a> tag, not only the first one.
    if (preg_match_all('/<a\s(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1/si', $data, $urls)) {
        foreach ($urls[2] as $suburl) {
            // Attribute values are HTML-encoded ("&amp;" etc.).
            $suburl = html_entity_decode(trim($suburl), ENT_QUOTES);

            // A fragment addresses a position inside a page, not another page.
            $hashPos = strpos($suburl, '#');
            if ($hashPos !== false) {
                $suburl = substr($suburl, 0, $hashPos);
            }
            if ($suburl === '') {
                continue;
            }

            if (substr($suburl, 0, 2) == '//') {
                // Protocol-relative link: inherit the scheme of this page.
                $suburl = $scheme.':'.$suburl;
            } elseif (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $suburl, $linkScheme)) {
                $linkScheme = strtolower($linkScheme[1]);
                if ($linkScheme != 'http' && $linkScheme != 'https') {
                    // mailto:, javascript:, tel:, ftp: ... are not crawlable.
                    continue;
                }
            } elseif ($suburl[0] == '/') {
                $suburl = $basedir.$suburl;
            } elseif ($suburl[0] == '?') {
                // Query-only link: same document, different query string.
                $suburl = $basedir.$path.$suburl;
            } else {
                $suburl = $subdir.$suburl;
            }

            $target = parse_url($suburl);
            if (!is_array($target) || empty($target['host'])) {
                continue;
            }

            if (strtolower($target['host']) != $host) {
                echo('<li><em>Extern link:</em> <a href="'.htmlspecialchars($suburl).'">'.htmlspecialchars($suburl).'</a></li>');
                continue;
            }

            // Rebuild the URL in a canonical form; without this, "../" links
            // would produce ever longer URLs for the same page and defeat the
            // visited check above.
            $suburl = strtolower($target['scheme']).'://'.$target['host']
                .(isset($target['port']) ? ':'.$target['port'] : '')
                .crawlRemoveDotSegments(isset($target['path']) ? $target['path'] : '/')
                .(isset($target['query']) ? '?'.$target['query'] : '');

            crawlURL($suburl);
        }
    }
    echo('</ul></li>');
}

/**
 * Resolves "." and ".." segments of an absolute URL path.
 *
 * @param string $path Path starting with "/", e.g. "/a/b/../c".
 * @return string The path without dot segments, e.g. "/a/c"; ".." never
 *                climbs above the root.
 */
function crawlRemoveDotSegments($path)
{
    $segments = explode('/', $path);
    $last = end($segments);
    $out = array();
    foreach ($segments as $segment) {
        if ($segment === '..') {
            // Index 0 is the empty segment in front of the leading slash.
            if (count($out) > 1) {
                array_pop($out);
            }
        } elseif ($segment !== '.') {
            $out[] = $segment;
        }
    }
    // A path ending in "." or ".." names a directory: keep the trailing slash.
    if ($last === '.' || $last === '..') {
        $out[] = '';
    }
    return implode('/', $out);
}
?>
</ul>