
PHPCrawl with simplehtmldom for parsing data

I'm trying to use PHPCrawl to crawl a site and collect URLs, then hand each page to simplehtmldom to pull the data I need out of the HTML and store it in a MySQL database. Right now I'm getting this error:


Fatal error: Call to undefined method simple_html_dom::find() in /home/content/54/11109254/html/PHPCrawl_081/skunktest.php on line 44


Can anyone help me figure out what I did wrong, and maybe take a look to see whether I have any other obstacles ahead of me?
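For reference, this particular fatal error usually means that the file actually included was not the expected simplehtmldom library (a different or incomplete simple_html_dom class got defined instead). A minimal sanity check, assuming the library's standard entry points (file_get_html() and the find() method), might look like this:

<?php 
// Hypothetical check: confirm the real simplehtmldom library was loaded 
include("../simple_html_dom.php"); 

if (!function_exists('file_get_html') || !method_exists('simple_html_dom', 'find')) { 
    die("simple_html_dom.php was included, but it is not the expected simplehtmldom library - check the include path."); 
} 
?>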

<?php 
set_time_limit(1000000); 

// Include the phpcrawl-mainclass 
include("libs/PHPCrawler.class.php"); 

// Include Simplehtmldom 
include("../simple_html_dom.php"); 

// Extend the class and override the handleDocumentInfo()-method 
class MyCrawler extends PHPCrawler 
{ 
    function handleDocumentInfo($DocInfo) 
    { 

    // Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>"). 
    if (PHP_SAPI == "cli") $lb = "\n"; 
    else $lb = "<br />"; 

    // Print the URL and the HTTP-status-Code 
    echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb; 

    // Print the referring URL 
    echo "Referer-page: ".$DocInfo->referer_url.$lb; 

    // Print whether the content of the document was received or not 
    if ($DocInfo->received == true) 
     echo "Content received: ".$DocInfo->bytes_received." bytes".$lb; 
    else 
     echo "Content not received".$lb; 

    // Now hand the received page over to simplehtmldom for parsing 

    $result = $DocInfo->url; 



    $html = file_get_html($result); 


    if($html && is_object($html) && isset($html->nodes)){ 

    // find() returns an array of matches; index 0 picks the first element 
    $partnumber = $html->find('div[class="product-sku"]', 0)->plaintext; 

    $title = $html->find('.product-name', 0)->plaintext; 

    $productnote = $html->find('.product-note', 0)->plaintext; 

    $description = $html->find('.product-description', 0)->innertext; 


    // Collect the zoom images; <img> tags carry their URL in "src", not "href" 
    $image = array(); 
    foreach($html->find('.MagicZoomBigImageCont') as $img) 
     { 
      foreach($img->find('img') as $e) 
       { 
        $image[] = $e->src; 
        $imagehref = $e->src; 
       } 
     } 

    foreach($html->find('.p-related-image') as $rel) 
     { 
      // find('a') returns an array, so index it directly instead of 
      // indexing each single element inside a foreach 
      $links = $rel->find('a'); 
      $rel1 = $links[0]->href; 
      $rel2 = $links[1]->href; 
      $rel3 = $links[2]->href; 
      $rel4 = $links[3]->href; 
      $rel5 = $links[4]->href; 
     } 


    foreach($html->find('.p-related-name') as $name) 
     { 
      $names = $name->find('a'); 
      $rel1n = $names[0]->plaintext; 
      $rel2n = $names[1]->plaintext; 
      $rel3n = $names[2]->plaintext; 
      $rel4n = $names[3]->plaintext; 
      $rel5n = $names[4]->plaintext; 
     } 

    $vehfitment = $html->find('div#appanel_1', 0)->outertext; 
    } else { echo "htmldom issue"; } 

    $manufacturer = "Skunk2"; 




//Make your connection to database 
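// Note: the mysql_* functions used below are deprecated (removed in PHP 7); 
// mysqli or PDO with prepared statements is the safer modern alternative. 
// $host, $username, $password, $database and $table must be defined elsewhere. 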
$con = mysql_connect($host,$username,$password); 

//Check your connection 
if (!$con) { 
die("Could not connect: " . mysql_error()); 
} 

//Select your database 
$db_selected = mysql_select_db($database, $con); 

//Check to make sure the database is there 
if (!$db_selected) { 
    die ('Can\'t use the db : ' . mysql_error()); 
} 

//Run query 
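//NOTE: these values come straight from scraped HTML and should be escaped 
//(e.g. with mysql_real_escape_string) before being interpolated into the query 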
$result = mysql_query("INSERT INTO $table(manufacturer, partnumber, title, productnote, description, rel1img, rel2img, rel3img, rel4img, rel5img, rel1name, rel2name, rel3name, rel4name, rel5name, image, vehfitment) VALUES('".$manufacturer."','".$partnumber."','".$title."','".$productnote."','".$description."','".$rel1."','".$rel2."','".$rel3."','".$rel4."','".$rel5."','".$rel1n."','".$rel2n."','".$rel3n."','".$rel4n."','".$rel5n."','".$imagehref."','".$vehfitment."')"); 

echo $manufacturer."<br>".$partnumber."<br>".$title."<br>".$productnote."<br>".$description."<br>".$rel1."<br>".$rel1n."<br>".implode("<br>", $image)."<br>".$imagehref."<br>".$vehfitment; 

for($k=0;$k<count($image);$k++){ 

echo '<img src="'.$image[$k].'"><br/>'; 

// copy() fetches the remote image and writes it to a local file 
$isok = copy($image[$k], dirname(__FILE__).'/desktop/skunk2'.($k+1).'.jpg'); 

if($isok==true){ 
    echo ' success!'; 
} 
else{ 
    echo ' Fail'; 
} 
} 

    echo $lb; 

    flush(); 
    } 
} 


// Now, create an instance of your class, define the behaviour 
// of the crawler (see class-reference for more options and details) 
// and start the crawling-process. 

$crawler = new MyCrawler(); 

// URL to crawl 
$crawler->setURL("store.skunk2.com"); 

// Store and send cookie-data like a browser does 
$crawler->enableCookieHandling(true); 

// Tell the crawler to stream everything but "text/html"-documents to a tmp-file 
$crawler->addStreamToFileContentType("#^((?!text/html).)*$#"); 

//User Agent String 

$crawler->setUserAgentString("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36"); 

//0 - The crawler will follow EVERY link, even if the link leads to a different host or domain. 
//If you choose this mode, you really should set a limit on the crawling-process (see limit-options), 
//otherwise the crawler may crawl the whole WWW! 

//1 - The crawler only follows links that lead to the same domain as the root-url. 
//E.g. if the root-url (setURL()) is "http://www.foo.com", the crawler will follow links to "http://www.foo.com/..." 
//and "http://bar.foo.com/...", but not to "http://www.another-domain.com/...". 

//2 - The crawler will only follow links that lead to the same host as the root-url. 
//E.g. if the root-url (setURL()) is "http://www.foo.com", the crawler will ONLY follow links to "http://www.foo.com/...", but not 
//to "http://bar.foo.com/..." or "http://www.another-domain.com/...". This is the default mode. 

//3 - The crawler only follows links to pages or files located in or under the same path as the root-url. 
//E.g. if the root-url is "http://www.foo.com/bar/index.html", the crawler will follow links to "http://www.foo.com/bar/page.html" and 
//"http://www.foo.com/bar/path/index.html", but not links to "http://www.foo.com/page.html". 
$crawler->setFollowMode(1); 

// That's enough, now here we go 
$crawler->go(); 

// At the end, after the process is finished, we print a short 
// report (see method getProcessReport() for more information) 
$report = $crawler->getProcessReport(); 

if (PHP_SAPI == "cli") $lb = "\n"; 
else $lb = "<br />"; 

echo "Summary:".$lb; 
echo "Links followed: ".$report->links_followed.$lb; 
echo "Documents received: ".$report->files_received.$lb; 
echo "Bytes received: ".$report->bytes_received." bytes".$lb; 
echo "Process runtime: ".$report->process_runtime." sec".$lb; 
?> 

What do you get when you call 'var_dump($html)' right after '$html = file_get_html($result);'? –

Answer


Try

$html = new simple_html_dom(); 
$html->load_file($DocInfo->url); 

if($html && is_object($html) && isset($html->nodes)){ 
... 
} 
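Since PHPCrawl has already downloaded the page, you can also skip the second HTTP request and feed the received markup straight to the parser. A minimal sketch, assuming the page content is available in $DocInfo->source (as the crawler's own comments note):

$html = new simple_html_dom(); 
$html->load($DocInfo->source); // parse the markup PHPCrawl already fetched 

if($html && is_object($html) && isset($html->nodes)){ 
... 
} 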

Thanks guys. It looks like I had a mixed-up include path for the simple_html_dom.php file. Now the script will begin to run, but I'm getting an error - ERR_CONTENT_DECODING_FAILED - Any thoughts? – partstaxi


Something seems broken with the site's headers. Check the content-type and see whether you can adjust the headers before continuing. – alkis
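ERR_CONTENT_DECODING_FAILED typically means the response advertises a Content-Encoding (such as gzip) that does not match the body. A hedged sketch of one way to inspect this with cURL, letting it negotiate and decode the encoding itself (the product URL here is hypothetical):

<?php 
// Fetch a page and let cURL handle gzip/deflate decoding itself 
$ch = curl_init("http://store.skunk2.com/some-product.html"); // hypothetical URL 
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); 
curl_setopt($ch, CURLOPT_ENCODING, ""); // "" = accept and decode any supported encoding 
$body = curl_exec($ch); 

// Inspect what the server actually claims to send 
echo "Content-Type: ".curl_getinfo($ch, CURLINFO_CONTENT_TYPE)."\n"; 
curl_close($ch); 
?>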


What was the problem? – alkis


Use the following code to get the crawler running smoothly; it is based on the verified examples at the source-code links below.

<?php 
set_time_limit(10000); 

// Include the phpcrawl-mainclass 
include("libs/PHPCrawler.class.php"); 

// Extend the class and override the handleDocumentInfo()-method 
class MyCrawler extends PHPCrawler 
{ 
    function handleDocumentInfo($DocInfo) 
    { 
        // Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>"). 
        if (PHP_SAPI == "cli") $lb = "\n"; 
        else $lb = "<br />"; 

        // Print the URL and the HTTP-status-code 
        echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb; 

        // Print the referring URL 
        echo "Referer-page: ".$DocInfo->referer_url.$lb; 

        // Print whether the content of the document was received or not 
        if ($DocInfo->received == true) 
            echo "Content received: ".$DocInfo->bytes_received." bytes".$lb; 
        else 
            echo "Content not received".$lb; 

        // Now you should do something with the content of the 
        // received page or file ($DocInfo->source), we skip it in this example 

        echo $lb; 
        flush(); 
    } 
} 

// Now, create an instance of your class, define the behaviour 
// of the crawler (see class-reference for more options and details) 
// and start the crawling-process. 
$crawler = new MyCrawler(); 

// URL to crawl 
$crawler->setURL("www.php.net"); 

// Only receive content of files with content-type "text/html" 
$crawler->addContentTypeReceiveRule("#text/html#"); 

// Ignore links to pictures, don't even request pictures 
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i"); 

// Store and send cookie-data like a browser does 
$crawler->enableCookieHandling(true); 

// Set the traffic-limit to 1 MB (in bytes, 
// for testing we don't want to "suck" the whole site) 
$crawler->setTrafficLimit(1000 * 1024); 

// That's enough, now here we go 
$crawler->go(); 

// At the end, after the process is finished, we print a short 
// report (see method getProcessReport() for more information) 
$report = $crawler->getProcessReport(); 

if (PHP_SAPI == "cli") $lb = "\n"; 
else $lb = "<br />"; 

echo "Summary:".$lb; 
echo "Links followed: ".$report->links_followed.$lb; 
echo "Documents received: ".$report->files_received.$lb; 
echo "Bytes received: ".$report->bytes_received." bytes".$lb; 
echo "Process runtime: ".$report->process_runtime." sec".$lb; 
?>

http://quandaflow.com/php-web-crawler/ http://phpcrawl.cuab.de/example.html
