Webスクレイピング手法 preg_match vs. simple_html_dom

March 14, 2016 – 11:57 am

公開Webから情報を抽出するWebスクレイピングについては、さまざまな議論がなされている。このブログサイトでも、Yahoo Financeから為替や株価のデータを抽出する方法を試してきた。そこでは、PHP関数 preg_match_all() を用いた正規表現マッチによって値を抽出していた。正規表現マッチ以外にも、DOM解析による方法などがあるようだ。

ここでは、日経電子版のフロントページ(http://www.nikkei.com/)に掲載される主な市場指標の表示値を取得することをターゲットとし、正規表現マッチを行うPHP関数 preg_match_all() と、DOM解析を基本とするライブラリ simple_html_dom.php の二つのツールを用いてスクレイピング法を比較してみた。

結論めいたことを言うと、原始的で泥臭い手法ではあるが、私にとっては、正規表現マッチによる方法が小回りが利いて良さそうだ。

preg_match_allを使った値の取得

test_preg_match.php ソース:


<?php
// test_preg_match.php
//
// Downloads the Nikkei front page with cURL and prints, via print_r(), the
// market-indicator fields extracted with preg_match_all(). One array is
// printed per pattern; index 0 holds the full matches and the following
// indexes hold the captured groups (PREG_PATTERN_ORDER layout).

$url = "http://www.nikkei.com/";

// Fetch the page body as a string instead of echoing it.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$feed = curl_exec($ch);
if ($feed === false) {
    // Without this check a failed transfer would silently yield empty
    // arrays below, which is indistinguishable from "page layout changed".
    $error = curl_error($ch);
    curl_close($ch);
    fwrite(STDERR, "Failed to fetch " . $url . ": " . $error . "\n");
    exit(1);
}
curl_close($ch);

// Collapse the document onto a single line so that the patterns do not
// have to account for line breaks inside the markup.
$feed = str_replace(array("\r\n", "\r", "\n"), '', $feed);

// Patterns are applied in this order; each one targets one column of the
// market-indicator table.
$patterns = array(
    // Group 1: indicator name, group 2: unit shown in parentheses.
    "/\"a-vam\">([^<]+)<span[^<]+_unit\">\(([^<]*)\)<\/span>/",
    // Group 1: date/time label of the quote.
    "/<span class=\"m-market_exchange_date\">([^<]*)<\/span>/",
    // Group 1: current value.
    "/<td class=\"m-market_rate\">([^<]*)<\/td>/",
    // Group 1: change from the previous value (class carries an extra token such as "plus").
    "/<td class=\"m-market_before[^<]+\">([^<]*)<\/td>/",
    // Group 1: percent change (may be empty).
    "/<td class=\"m-market_percent[^<]+\">([^<]*)<\/td>/",
);

foreach ($patterns as $pattern) {
    $cell = array();
    // preg_match_all() returns false on a PCRE error; report it rather than
    // printing a misleading result.
    if (preg_match_all($pattern, $feed, $cell, PREG_PATTERN_ORDER) === false) {
        fwrite(STDERR, "preg_match_all failed (error code " . preg_last_error() . ") for pattern: " . $pattern . "\n");
        continue;
    }
    print_r($cell);
}

?>

    

test_preg_match.php 出力例:

$ php test_preg_match.php
Array
(
    [0] => Array
        (
            [0] => "a-vam">日経平均<span class="m-market_exchange_unit">(円)</span>
            [1] => "a-vam">NYダウ<span class="m-market_exchange_unit">(ドル)</span>
            [2] => "a-vam">ドル<span class="m-market_exchange_unit">(円)</span>
            [3] => "a-vam">ユーロ<span class="m-market_exchange_unit">(円)</span>
            [4] => "a-vam">長期金利<span class="m-market_exchange_unit">(%)</span>
            [5] => "a-vam">NY原油<span class="m-market_exchange_unit">(ドル)</span>
        )

    [1] => Array
        (
            [0] => 日経平均
            [1] => NYダウ
            [2] => ドル
            [3] => ユーロ
            [4] => 長期金利
            [5] => NY原油
        )

    [2] => Array
        (
            [0] => 円
            [1] => ドル
            [2] => 円
            [3] => 円
            [4] => %
            [5] => ドル
        )

)
Array
(
    [0] => Array
        (
            [0] => <span class="m-market_exchange_date">3/14 9:43</span>
            [1] => <span class="m-market_exchange_date">3/11 16:47</span>
            [2] => <span class="m-market_exchange_date">3/14 9:23</span>
            [3] => <span class="m-market_exchange_date">3/14 9:23</span>
            [4] => <span class="m-market_exchange_date">3/11 17:48</span>
            [5] => <span class="m-market_exchange_date">3/11 終値</span>
        )

    [1] => Array
        (
            [0] => 3/14 9:43
            [1] => 3/11 16:47
            [2] => 3/14 9:23
            [3] => 3/14 9:23
            [4] => 3/11 17:48
            [5] => 3/11 終値
        )

)
Array
(
    [0] => Array
        (
            [0] => <td class="m-market_rate">17,167.37</td>
            [1] => <td class="m-market_rate">17,213.31</td>
            [2] => <td class="m-market_rate">113.71-76</td>
            [3] => <td class="m-market_rate">126.92-94</td>
            [4] => <td class="m-market_rate">-0.015</td>
            [5] => <td class="m-market_rate">38.50</td>
        )

    [1] => Array
        (
            [0] => 17,167.37
            [1] => 17,213.31
            [2] => 113.71-76
            [3] => 126.92-94
            [4] => -0.015
            [5] => 38.50
        )

)
Array
(
    [0] => Array
        (
            [0] => <td class="m-market_before plus">+228.50</td>
            [1] => <td class="m-market_before plus">+218.18</td>
            [2] => <td class="m-market_before ">±0.00</td>
            [3] => <td class="m-market_before plus">+0.20円安</td>
            [4] => <td class="m-market_before plus">+0.010</td>
            [5] => <td class="m-market_before plus">+0.66</td>
        )

    [1] => Array
        (
            [0] => +228.50
            [1] => +218.18
            [2] => ±0.00
            [3] => +0.20円安
            [4] => +0.010
            [5] => +0.66
        )

)
Array
(
    [0] => Array
        (
            [0] => <td class="m-market_percent plus">+1.35%</td>
            [1] => <td class="m-market_percent plus">+1.28%</td>
            [2] => <td class="m-market_percent ">±0.00%</td>
            [3] => <td class="m-market_percent plus">+0.16%</td>
            [4] => <td class="m-market_percent "></td>
            [5] => <td class="m-market_percent plus">+1.74%</td>
        )

    [1] => Array
        (
            [0] => +1.35%
            [1] => +1.28%
            [2] => ±0.00%
            [3] => +0.16%
            [4] => 
            [5] => +1.74%
        )

)

   

simple_html_dom を活用した値の取得

test_html_dom.phpソース:

<?php
// test_html_dom.php
//
// Downloads the Nikkei front page with cURL, parses it with simple_html_dom
// and prints, via print_r(), the market-indicator fields selected with CSS
// selectors. For each selector index 0 holds the outer HTML of every matched
// element and index 1 its inner HTML.
include "simple_html_dom.php";

/**
 * Collects the outer and inner HTML of every element matching a selector.
 *
 * @param simple_html_dom $html        Parsed document to search.
 * @param string          $selector    Selector passed to find().
 * @param string          $emptyFiller Value stored in place of an empty inner HTML.
 * @return array Empty when nothing matched; otherwise [0] => outertext list,
 *               [1] => innertext list.
 */
function collect_cells($html, $selector, $emptyFiller = '')
{
    $cell = array();
    foreach ($html->find($selector) as $element) {
        $inner = (string) $element->innertext;
        $cell[0][] = $element->outertext;
        // Substitute the filler in a local value only; the DOM node itself is
        // left untouched, and a literal "0" is not mistaken for "empty".
        $cell[1][] = ($inner === '') ? $emptyFiller : $inner;
    }
    return $cell;
}

$url = "http://www.nikkei.com/";

// Fetch the page body as a string instead of echoing it.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$feed = curl_exec($ch);
if ($feed === false) {
    // Without this check a failed transfer would be parsed as an empty
    // document and the script would silently print empty arrays.
    $error = curl_error($ch);
    curl_close($ch);
    fwrite(STDERR, "Failed to fetch " . $url . ": " . $error . "\n");
    exit(1);
}
curl_close($ch);

// Collapse the document onto a single line before parsing.
$feed = str_replace(array("\r\n", "\r", "\n"), '', $feed);

$html = new simple_html_dom();
$html->load($feed);

// Narrow the document down to its <table> elements and re-parse only those,
// so that the selectors below are evaluated against table markup alone.
$feed_table_tag = "";
foreach ($html->find("table") as $element) {
    $feed_table_tag .= $element->outertext;
}
$html->load($feed_table_tag);

// Indicator name and unit: additionally keep the plain-text form at index 2.
$cell = array();
foreach ($html->find("span.a-vam") as $element) {
    $cell[0][] = $element->outertext;
    $cell[1][] = $element->innertext;
    $cell[2][] = $element->plaintext;
}
print_r($cell);

// Date/time label of each quote.
print_r(collect_cells($html, "span.m-market_exchange_date"));

// Current value.
print_r(collect_cells($html, "td.m-market_rate"));

// NOTE(review): this second "td.m-market_rate" pass repeats the previous one
// and looks like a copy-paste leftover. It is kept because the sample output
// shown after this script lists the rate array twice; drop this call and the
// duplicated listing together.
print_r(collect_cells($html, "td.m-market_rate"));

// Change from the previous value.
print_r(collect_cells($html, "td.m-market_before"));

// Percent change; an empty cell is reported as a single space.
print_r(collect_cells($html, "td.m-market_percent", " "));

?>

   

test_html_dom.php 出力例

$ php test_html_dom.php
Array
(
    [0] => Array
        (
            [0] => <span class="a-vam">日経平均<span class="m-market_exchange_unit">(円)</span></span>
            [1] => <span class="a-vam">NYダウ<span class="m-market_exchange_unit">(ドル)</span></span>
            [2] => <span class="a-vam">ドル<span class="m-market_exchange_unit">(円)</span></span>
            [3] => <span class="a-vam">ユーロ<span class="m-market_exchange_unit">(円)</span></span>
            [4] => <span class="a-vam">長期金利<span class="m-market_exchange_unit">(%)</span></span>
            [5] => <span class="a-vam">NY原油<span class="m-market_exchange_unit">(ドル)</span></span>
        )

    [1] => Array
        (
            [0] => 日経平均<span class="m-market_exchange_unit">(円)</span>
            [1] => NYダウ<span class="m-market_exchange_unit">(ドル)</span>
            [2] => ドル<span class="m-market_exchange_unit">(円)</span>
            [3] => ユーロ<span class="m-market_exchange_unit">(円)</span>
            [4] => 長期金利<span class="m-market_exchange_unit">(%)</span>
            [5] => NY原油<span class="m-market_exchange_unit">(ドル)</span>
        )

    [2] => Array
        (
            [0] => 日経平均(円)  
            [1] => NYダウ(ドル)  
            [2] => ドル(円)  
            [3] => ユーロ(円)  
            [4] => 長期金利(%)  
            [5] => NY原油(ドル)  
        )

)
Array
(
    [0] => Array
        (
            [0] => <span class="m-market_exchange_date">3/14 9:37</span>
            [1] => <span class="m-market_exchange_date">3/11 16:47</span>
            [2] => <span class="m-market_exchange_date">3/14 9:17</span>
            [3] => <span class="m-market_exchange_date">3/14 9:17</span>
            [4] => <span class="m-market_exchange_date">3/11 17:48</span>
            [5] => <span class="m-market_exchange_date">3/11 終値</span>
        )

    [1] => Array
        (
            [0] => 3/14 9:37
            [1] => 3/11 16:47
            [2] => 3/14 9:17
            [3] => 3/14 9:17
            [4] => 3/11 17:48
            [5] => 3/11 終値
        )

)
Array
(
    [0] => Array
        (
            [0] => <td class="m-market_rate">17,155.01</td>
            [1] => <td class="m-market_rate">17,213.31</td>
            [2] => <td class="m-market_rate">113.74-76</td>
            [3] => <td class="m-market_rate">126.95-98</td>
            [4] => <td class="m-market_rate">-0.015</td>
            [5] => <td class="m-market_rate">38.50</td>
        )

    [1] => Array
        (
            [0] => 17,155.01
            [1] => 17,213.31
            [2] => 113.74-76
            [3] => 126.95-98
            [4] => -0.015
            [5] => 38.50
        )

)
Array
(
    [0] => Array
        (
            [0] => <td class="m-market_rate">17,155.01</td>
            [1] => <td class="m-market_rate">17,213.31</td>
            [2] => <td class="m-market_rate">113.74-76</td>
            [3] => <td class="m-market_rate">126.95-98</td>
            [4] => <td class="m-market_rate">-0.015</td>
            [5] => <td class="m-market_rate">38.50</td>
        )

    [1] => Array
        (
            [0] => 17,155.01
            [1] => 17,213.31
            [2] => 113.74-76
            [3] => 126.95-98
            [4] => -0.015
            [5] => 38.50
        )

)
Array
(
    [0] => Array
        (
            [0] => <td class="m-market_before plus">+216.14</td>
            [1] => <td class="m-market_before plus">+218.18</td>
            [2] => <td class="m-market_before plus">+0.03円安</td>
            [3] => <td class="m-market_before plus">+0.23円安</td>
            [4] => <td class="m-market_before plus">+0.010</td>
            [5] => <td class="m-market_before plus">+0.66</td>
        )

    [1] => Array
        (
            [0] => +216.14
            [1] => +218.18
            [2] => +0.03円安
            [3] => +0.23円安
            [4] => +0.010
            [5] => +0.66
        )

)
Array
(
    [0] => Array
        (
            [0] => <td class="m-market_percent plus">+1.28%</td>
            [1] => <td class="m-market_percent plus">+1.28%</td>
            [2] => <td class="m-market_percent plus">+0.03%</td>
            [3] => <td class="m-market_percent plus">+0.18%</td>
            [4] => <td class="m-market_percent"></td>
            [5] => <td class="m-market_percent plus">+1.74%</td>
        )

    [1] => Array
        (
            [0] => +1.28%
            [1] => +1.28%
            [2] => +0.03%
            [3] => +0.18%
            [4] =>  
            [5] => +1.74%
        )

)

  
  


Post a Comment