I have a web page with a whole bunch of links on it. I want to write a script that dumps all the data contained in those links into a local file.
Has anybody done that with PHP? General guidelines and gotchas would suffice as an answer.
Meh. Don't parse HTML with regexes.
Here's a DOM version inspired by Tatu's:
<?php
function crawl_page($url, $depth = 5)
{
    // remember visited URLs so the same page is never followed twice
    static $seen = array();
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;

    $dom = new DOMDocument('1.0');
    @$dom->loadHTMLFile($url);

    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = $element->getAttribute('href');
        if (0 !== strpos($href, 'http')) {
            // resolve relative URLs against the base URL
            $path = '/' . ltrim($href, '/');
            if (extension_loaded('http')) {
                $href = http_build_url($url, array('path' => $path));
            } else {
                $parts = parse_url($url);
                $href = $parts['scheme'] . '://';
                if (isset($parts['user']) && isset($parts['pass'])) {
                    $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                }
                $href .= $parts['host'];
                if (isset($parts['port'])) {
                    $href .= ':' . $parts['port'];
                }
                $href .= dirname($parts['path'], 1) . $path;
            }
        }
        crawl_page($href, $depth - 1);
    }
    echo "URL:", $url, PHP_EOL, "CONTENT:", PHP_EOL, $dom->saveHTML(), PHP_EOL, PHP_EOL;
}

crawl_page("http://hobodave.com", 2);
Edit: I fixed some bugs in Tatu's version (it now works with relative URLs).
Edit: I added a new bit of functionality that prevents it from following the same URL twice.
Edit: echoing output to STDOUT now, so you can redirect it to whatever file you want.
Edit: Fixed a bug pointed out by George in his answer. Relative URLs will no longer be appended to the end of the URL path, but will overwrite it. Thanks to George for this. Note that George's answer doesn't handle any of: https, user, pass, or port. If you have the http PECL extension loaded, this is quite simply done with http_build_url. Otherwise, I have to glue the URL together manually using parse_url. Thanks again, George.
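For reference, the relative-URL handling described above boils down to something like the following standalone sketch (the helper name resolve_href is just for illustration, and the dirname() handling of the base path is left out for brevity): use http_build_url() when the PECL http extension is loaded, otherwise glue the absolute URL together from the parse_url() parts.

// Illustrative helper, not part of the original answer: resolve a
// possibly-relative href against a base URL, the same way as above.
function resolve_href($base, $href)
{
    if (0 === strpos($href, 'http')) {
        return $href; // already absolute
    }
    $path = '/' . ltrim($href, '/');
    if (extension_loaded('http')) {
        // PECL http extension: replace the path component of the base URL
        return http_build_url($base, array('path' => $path));
    }
    // manual fallback: rebuild scheme://user:pass@host:port + path
    $parts = parse_url($base);
    $abs = $parts['scheme'] . '://';
    if (isset($parts['user'], $parts['pass'])) {
        $abs .= $parts['user'] . ':' . $parts['pass'] . '@';
    }
    $abs .= $parts['host'];
    if (isset($parts['port'])) {
        $abs .= ':' . $parts['port'];
    }
    return $abs . $path;
}

// resolve_href('http://user:pw@example.com:8080/index.php', 'about.html')
// gives 'http://user:pw@example.com:8080/about.html'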
Here is my implementation, based on the example/answer above.
It is class based
Uses curl
Supports HTTP authentication
Skips URLs that do not belong to the base domain
Returns the HTTP header response code for every page
Returns the response time for every page
CRAWL CLASS:
class crawler
{
    protected $_url;
    protected $_depth;
    protected $_host;
    protected $_useHttpAuth = false;
    protected $_user;
    protected $_pass;
    protected $_seen = array();
    protected $_filter = array();

    public function __construct($url, $depth = 5)
    {
        $this->_url = $url;
        $this->_depth = $depth;
        $parse = parse_url($url);
        $this->_host = $parse['host'];
    }

    protected function _processAnchors($content, $url, $depth)
    {
        $dom = new DOMDocument('1.0');
        @$dom->loadHTML($content);
        $anchors = $dom->getElementsByTagName('a');
        foreach ($anchors as $element) {
            $href = $element->getAttribute('href');
            if (0 !== strpos($href, 'http')) {
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    $href = http_build_url($url, array('path' => $path));
                } else {
                    $parts = parse_url($url);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
            }
            // Crawl only links that belong to the start domain
            $this->crawl_page($href, $depth - 1);
        }
    }

    protected function _getContent($url)
    {
        $handle = curl_init($url);
        if ($this->_useHttpAuth) {
            curl_setopt($handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
            curl_setopt($handle, CURLOPT_USERPWD, $this->_user . ":" . $this->_pass);
        }
        // following 302 redirects creates problems with authentication
        // curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE);
        // return the content
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);
        /* Get the HTML or whatever is linked in $url. */
        $response = curl_exec($handle);
        // response total time
        $time = curl_getinfo($handle, CURLINFO_TOTAL_TIME);
        /* Check for 404 (file not found). */
        $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);
        return array($response, $httpCode, $time);
    }

    protected function _printResult($url, $depth, $httpcode, $time)
    {
        ob_end_flush();
        $currentDepth = $this->_depth - $depth;
        $count = count($this->_seen);
        echo "N::$count,CODE::$httpcode,TIME::$time,DEPTH::$currentDepth URL::$url <br>";
        ob_start();
        flush();
    }

    protected function isValid($url, $depth)
    {
        if (strpos($url, $this->_host) === false
            || $depth === 0
            || isset($this->_seen[$url])
        ) {
            return false;
        }
        foreach ($this->_filter as $excludePath) {
            if (strpos($url, $excludePath) !== false) {
                return false;
            }
        }
        return true;
    }

    public function crawl_page($url, $depth)
    {
        if (!$this->isValid($url, $depth)) {
            return;
        }
        // add to the seen URLs
        $this->_seen[$url] = true;
        // get content and return code
        list($content, $httpcode, $time) = $this->_getContent($url);
        // print the result for the current page
        $this->_printResult($url, $depth, $httpcode, $time);
        // process sub-pages
        $this->_processAnchors($content, $url, $depth);
    }

    public function setHttpAuth($user, $pass)
    {
        $this->_useHttpAuth = true;
        $this->_user = $user;
        $this->_pass = $pass;
    }

    public function addFilterPath($path)
    {
        $this->_filter[] = $path;
    }

    public function run()
    {
        $this->crawl_page($this->_url, $this->_depth);
    }
}
Usage:

// USAGE
$startURL = 'http://YOUR_URL/';
$depth = 6;
$username = 'YOURUSER';
$password = 'YOURPASS';

$crawler = new crawler($startURL, $depth);
$crawler->setHttpAuth($username, $password);
// Exclude paths with the following structure from being processed
$crawler->addFilterPath('customer/account/login/referer');
$crawler->run();
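The class above only prints status lines; since the question was about dumping the page data into local files, one possible way (an untested sketch; the subclass name dumping_crawler and the ./dump directory are my own additions, not part of the answer) is to extend it and save each response before its links are processed:

// Hypothetical subclass that also saves every fetched page to disk.
class dumping_crawler extends crawler
{
    protected $_dumpDir;

    public function __construct($url, $depth = 5, $dumpDir = './dump')
    {
        parent::__construct($url, $depth);
        $this->_dumpDir = rtrim($dumpDir, '/');
        if (!is_dir($this->_dumpDir)) {
            mkdir($this->_dumpDir, 0777, true);
        }
    }

    public function crawl_page($url, $depth)
    {
        if (!$this->isValid($url, $depth)) {
            return;
        }
        $this->_seen[$url] = true;
        list($content, $httpcode, $time) = $this->_getContent($url);
        // save the raw response under a file name derived from the URL
        file_put_contents($this->_dumpDir . '/' . md5($url) . '.html', $content);
        $this->_printResult($url, $depth, $httpcode, $time);
        $this->_processAnchors($content, $url, $depth);
    }
}

// $crawler = new dumping_crawler($startURL, $depth, './dump');

Using md5($url) as the file name keeps the URL-to-file mapping simple, at the cost of human-readable names.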
Check out PHP Crawler:
http://sourceforge.net/projects/php-crawler/
See if it helps.
In its simplest form:
function crawl_page($url, $depth = 5)
{
    if ($depth > 0) {
        $html = file_get_contents($url);

        // collect the href attribute of every link on the page
        preg_match_all('~<a.*?href="(.*?)"~i', $html, $matches);

        foreach ($matches[1] as $newurl) {
            crawl_page($newurl, $depth - 1);
        }

        file_put_contents('results.txt', $url . "\n\n" . $html . "\n\n", FILE_APPEND);
    }
}

crawl_page('http://www.domain.com/index.php', 5);
The function gets the contents of a page, then crawls all the links it finds and saves the contents to "results.txt". The function accepts a second parameter, depth, which defines how many levels of links should be followed. Pass 1 there if you only want to parse the links on the given page.
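For example, to fetch and save only the given page without following any of the links it contains (the URL is a placeholder):

// depth 1: the links are parsed but not followed, so only this page
// ends up in results.txt
crawl_page('http://www.example.com/', 1);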
Why use PHP for this, when you can use wget, e.g.
wget -r -l 1 http://www.example.com
For how to parse the contents, see Best Methods to Parse HTML and use the search function for examples. How to parse HTML has been answered multiple times before.
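As a rough illustration of the DOM-based parsing those answers recommend over regular expressions (the URL is a placeholder), listing the link targets of a page can look like this:

// Load a page and print every link target using DOM + XPath.
$dom = new DOMDocument('1.0');
@$dom->loadHTMLFile('http://www.example.com/'); // @ silences warnings about sloppy HTML

$xpath = new DOMXPath($dom);
foreach ($xpath->query('//a[@href]') as $anchor) {
    echo $anchor->getAttribute('href'), PHP_EOL;
}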
With a few small changes to hobodave's code, here is a code snippet you can use to crawl pages. It needs the curl extension to be enabled on your server.
"); preg_match_all("/]*?href[\s]?=[\s\"\']+"."(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $stripped_file, $matches, PREG_SET_ORDER ); foreach($matches as $match){ $href = $match[1]; if (0 !== strpos($href, 'http')) { $path = '/' . ltrim($href, '/'); if (extension_loaded('http')) { $href = http_build_url($href , array('path' => $path)); } else { $parts = parse_url($href); $href = $parts['scheme'] . '://'; if (isset($parts['user']) && isset($parts['pass'])) { $href .= $parts['user'] . ':' . $parts['pass'] . '@'; } $href .= $parts['host']; if (isset($parts['port'])) { $href .= ':' . $parts['port']; } $href .= $path; } } crawl_page($href, $depth - 1); } } echo "Crawled {$href}"; } crawl_page("http://www.sitename.com/",3); ?>
I have explained this script step by step in this crawler script tutorial.