Webスクレイピング手法 preg_match vs. simple_html_dom
March 14, 2016 – 11:57 am
公開Webから情報を抽出するWebスクレイピングについては、さまざまに議論されている。このブログサイトでも、Yahoo Financeから為替や株価のデータを抽出する方法を試してきた。そこでは、PHP関数 preg_match_all() を用いた正規表現マッチによる方法を使ってきた。正規表現マッチ以外にも、DOM解析による方法などがあるようだ。
ここでは、日経電子版のフロントページ(http://www.nikkei.com/)に掲載される主な市場指標の表示値の取得をターゲットとし、正規表現マッチを行うPHP関数 preg_match_all() と、DOM解析を基本とするライブラリ simple_html_dom.php の二つを用いて、スクレイピング法を比較してみた。
結論めいたことを言うと、原始的で泥臭い手法ではあるが、私にとっては、正規表現マッチによる方法が小回りが利いて良さそうだ。
preg_match_allを使った値の取得:
test_preg_match.php ソース:
<?php
/**
 * test_preg_match.php
 *
 * Downloads the Nikkei front page and dumps the market-indicator fields
 * with print_r(), extracting every field with preg_match_all().
 *
 * Intended to be run from the command line (errors are written to STDERR).
 */

$url = 'http://www.nikkei.com/';

// Fetch the page body as a string (CURLOPT_RETURNTRANSFER) instead of echoing it.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$feed = curl_exec($ch);
if ($feed === false) {
    // curl_exec() returns false on failure; stop here instead of silently
    // running every pattern against an empty document.
    $error = curl_error($ch);
    curl_close($ch);
    fwrite(STDERR, "Failed to fetch {$url}: {$error}\n");
    exit(1);
}
curl_close($ch);

// Strip all line breaks before matching; none of the patterns below
// contains any newline handling of its own.
$feed = str_replace(["\r\n", "\r", "\n"], '', $feed);

// One pattern per dumped field, in output order.
$patterns = [
    // Capture 1: indicator label, capture 2: unit inside the parentheses.
    '/"a-vam">([^<]+)<span[^<]+_unit">\(([^<]*)\)<\/span>/',
    // Quote date/time of each indicator.
    '/<span class="m-market_exchange_date">([^<]*)<\/span>/',
    // Current value.
    '/<td class="m-market_rate">([^<]*)<\/td>/',
    // Change from the previous value. [^"]* (rather than [^<]+) also accepts
    // a class attribute that has no modifier after the base class name.
    '/<td class="m-market_before[^"]*">([^<]*)<\/td>/',
    // Change in percent; same class-attribute handling as above.
    '/<td class="m-market_percent[^"]*">([^<]*)<\/td>/',
];

foreach ($patterns as $pattern) {
    $cell = [];
    $matched = preg_match_all($pattern, $feed, $cell, PREG_PATTERN_ORDER);
    if ($matched === false) {
        // A PCRE failure (e.g. backtrack limit) would otherwise look like "no data".
        fwrite(STDERR, "preg_match_all() failed for {$pattern} (PCRE error " . preg_last_error() . ")\n");
        exit(1);
    }
    // $cell[0] holds the full matches, $cell[1..] the capture groups.
    print_r($cell);
}
?>
test_preg_match.php 出力例:
$ php test_preg_match.php Array ( [0] => Array ( [0] => "a-vam">日経平均<span class="m-market_exchange_unit">(円)</span> [1] => "a-vam">NYダウ<span class="m-market_exchange_unit">(ドル)</span> [2] => "a-vam">ドル<span class="m-market_exchange_unit">(円)</span> [3] => "a-vam">ユーロ<span class="m-market_exchange_unit">(円)</span> [4] => "a-vam">長期金利<span class="m-market_exchange_unit">(%)</span> [5] => "a-vam">NY原油<span class="m-market_exchange_unit">(ドル)</span> ) [1] => Array ( [0] => 日経平均 [1] => NYダウ [2] => ドル [3] => ユーロ [4] => 長期金利 [5] => NY原油 ) [2] => Array ( [0] => 円 [1] => ドル [2] => 円 [3] => 円 [4] => % [5] => ドル ) ) Array ( [0] => Array ( [0] => <span class="m-market_exchange_date">3/14 9:43</span> [1] => <span class="m-market_exchange_date">3/11 16:47</span> [2] => <span class="m-market_exchange_date">3/14 9:23</span> [3] => <span class="m-market_exchange_date">3/14 9:23</span> [4] => <span class="m-market_exchange_date">3/11 17:48</span> [5] => <span class="m-market_exchange_date">3/11 終値</span> ) [1] => Array ( [0] => 3/14 9:43 [1] => 3/11 16:47 [2] => 3/14 9:23 [3] => 3/14 9:23 [4] => 3/11 17:48 [5] => 3/11 終値 ) ) Array ( [0] => Array ( [0] => <td class="m-market_rate">17,167.37</td> [1] => <td class="m-market_rate">17,213.31</td> [2] => <td class="m-market_rate">113.71-76</td> [3] => <td class="m-market_rate">126.92-94</td> [4] => <td class="m-market_rate">-0.015</td> [5] => <td class="m-market_rate">38.50</td> ) [1] => Array ( [0] => 17,167.37 [1] => 17,213.31 [2] => 113.71-76 [3] => 126.92-94 [4] => -0.015 [5] => 38.50 ) ) Array ( [0] => Array ( [0] => <td class="m-market_before plus">+228.50</td> [1] => <td class="m-market_before plus">+218.18</td> [2] => <td class="m-market_before ">±0.00</td> [3] => <td class="m-market_before plus">+0.20円安</td> [4] => <td class="m-market_before plus">+0.010</td> [5] => <td class="m-market_before plus">+0.66</td> ) [1] => Array ( [0] => +228.50 [1] => +218.18 [2] => ±0.00 [3] => +0.20円安 [4] => +0.010 [5] => +0.66 ) ) Array ( [0] => 
Array ( [0] => <td class="m-market_percent plus">+1.35%</td> [1] => <td class="m-market_percent plus">+1.28%</td> [2] => <td class="m-market_percent ">±0.00%</td> [3] => <td class="m-market_percent plus">+0.16%</td> [4] => <td class="m-market_percent "></td> [5] => <td class="m-market_percent plus">+1.74%</td> ) [1] => Array ( [0] => +1.35% [1] => +1.28% [2] => ±0.00% [3] => +0.16% [4] => [5] => +1.74% ) )
simple_html_dom を活用した値の取得
test_html_dom.phpソース:
<?php
/**
 * test_html_dom.php
 *
 * Downloads the Nikkei front page and dumps the market-indicator fields
 * with print_r(), locating every field through simple_html_dom selectors.
 *
 * Intended to be run from the command line (errors are written to STDERR).
 */

// Fail immediately if the library is missing instead of hitting an
// undefined-class error further down.
require_once 'simple_html_dom.php';

$url = 'http://www.nikkei.com/';

// Fetch the page body as a string (CURLOPT_RETURNTRANSFER) instead of echoing it.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$feed = curl_exec($ch);
if ($feed === false) {
    // curl_exec() returns false on failure; stop here instead of parsing nothing.
    $error = curl_error($ch);
    curl_close($ch);
    fwrite(STDERR, "Failed to fetch {$url}: {$error}\n");
    exit(1);
}
curl_close($ch);

// Strip all line breaks before parsing.
$feed = str_replace(["\r\n", "\r", "\n"], '', $feed);

$html = new simple_html_dom();
$html->load($feed);

// Keep only the <table> elements and re-parse just that markup, so the
// selectors below run against the tables rather than the whole page.
$feed_table_tag = '';
foreach ($html->find('table') as $element) {
    $feed_table_tag .= $element->outertext;
}
$html->load($feed_table_tag);

// One entry per dump, in output order:
//   selector - CSS selector handed to find()
//   fields   - element properties collected into $cell[0], $cell[1], ...
//   padEmpty - replace an empty innertext with a single space
$dumps = [
    ['selector' => 'span.a-vam', 'fields' => ['outertext', 'innertext', 'plaintext'], 'padEmpty' => false],
    ['selector' => 'span.m-market_exchange_date', 'fields' => ['outertext', 'innertext'], 'padEmpty' => false],
    ['selector' => 'td.m-market_rate', 'fields' => ['outertext', 'innertext'], 'padEmpty' => false],
    // NOTE(review): td.m-market_rate is dumped a second time, which looks like
    // a copy/paste leftover. It is kept so that the script's output still
    // matches the sample output printed after this listing.
    ['selector' => 'td.m-market_rate', 'fields' => ['outertext', 'innertext'], 'padEmpty' => false],
    ['selector' => 'td.m-market_before', 'fields' => ['outertext', 'innertext'], 'padEmpty' => false],
    ['selector' => 'td.m-market_percent', 'fields' => ['outertext', 'innertext'], 'padEmpty' => true],
];

foreach ($dumps as $dump) {
    $cell = [];
    foreach ($html->find($dump['selector']) as $element) {
        foreach ($dump['fields'] as $index => $field) {
            $value = $element->{$field};
            // Substitute the placeholder on a local copy only: the DOM node is
            // left untouched, and a literal "0" is not mistaken for "empty".
            if ($dump['padEmpty'] && $field === 'innertext' && (string) $value === '') {
                $value = ' ';
            }
            $cell[$index][] = $value;
        }
    }
    print_r($cell);
}

// Release the parser's internal node references.
$html->clear();
?>
test_html_dom.php 出力例
$ php test_html_dom.php Array ( [0] => Array ( [0] => <span class="a-vam">日経平均<span class="m-market_exchange_unit">(円)</span></span> [1] => <span class="a-vam">NYダウ<span class="m-market_exchange_unit">(ドル)</span></span> [2] => <span class="a-vam">ドル<span class="m-market_exchange_unit">(円)</span></span> [3] => <span class="a-vam">ユーロ<span class="m-market_exchange_unit">(円)</span></span> [4] => <span class="a-vam">長期金利<span class="m-market_exchange_unit">(%)</span></span> [5] => <span class="a-vam">NY原油<span class="m-market_exchange_unit">(ドル)</span></span> ) [1] => Array ( [0] => 日経平均<span class="m-market_exchange_unit">(円)</span> [1] => NYダウ<span class="m-market_exchange_unit">(ドル)</span> [2] => ドル<span class="m-market_exchange_unit">(円)</span> [3] => ユーロ<span class="m-market_exchange_unit">(円)</span> [4] => 長期金利<span class="m-market_exchange_unit">(%)</span> [5] => NY原油<span class="m-market_exchange_unit">(ドル)</span> ) [2] => Array ( [0] => 日経平均(円) [1] => NYダウ(ドル) [2] => ドル(円) [3] => ユーロ(円) [4] => 長期金利(%) [5] => NY原油(ドル) ) ) Array ( [0] => Array ( [0] => <span class="m-market_exchange_date">3/14 9:37</span> [1] => <span class="m-market_exchange_date">3/11 16:47</span> [2] => <span class="m-market_exchange_date">3/14 9:17</span> [3] => <span class="m-market_exchange_date">3/14 9:17</span> [4] => <span class="m-market_exchange_date">3/11 17:48</span> [5] => <span class="m-market_exchange_date">3/11 終値</span> ) [1] => Array ( [0] => 3/14 9:37 [1] => 3/11 16:47 [2] => 3/14 9:17 [3] => 3/14 9:17 [4] => 3/11 17:48 [5] => 3/11 終値 ) ) Array ( [0] => Array ( [0] => <td class="m-market_rate">17,155.01</td> [1] => <td class="m-market_rate">17,213.31</td> [2] => <td class="m-market_rate">113.74-76</td> [3] => <td class="m-market_rate">126.95-98</td> [4] => <td class="m-market_rate">-0.015</td> [5] => <td class="m-market_rate">38.50</td> ) [1] => Array ( [0] => 17,155.01 [1] => 17,213.31 [2] => 113.74-76 [3] => 126.95-98 [4] => -0.015 [5] => 38.50 ) ) Array ( [0] => Array ( 
[0] => <td class="m-market_rate">17,155.01</td> [1] => <td class="m-market_rate">17,213.31</td> [2] => <td class="m-market_rate">113.74-76</td> [3] => <td class="m-market_rate">126.95-98</td> [4] => <td class="m-market_rate">-0.015</td> [5] => <td class="m-market_rate">38.50</td> ) [1] => Array ( [0] => 17,155.01 [1] => 17,213.31 [2] => 113.74-76 [3] => 126.95-98 [4] => -0.015 [5] => 38.50 ) ) Array ( [0] => Array ( [0] => <td class="m-market_before plus">+216.14</td> [1] => <td class="m-market_before plus">+218.18</td> [2] => <td class="m-market_before plus">+0.03円安</td> [3] => <td class="m-market_before plus">+0.23円安</td> [4] => <td class="m-market_before plus">+0.010</td> [5] => <td class="m-market_before plus">+0.66</td> ) [1] => Array ( [0] => +216.14 [1] => +218.18 [2] => +0.03円安 [3] => +0.23円安 [4] => +0.010 [5] => +0.66 ) ) Array ( [0] => Array ( [0] => <td class="m-market_percent plus">+1.28%</td> [1] => <td class="m-market_percent plus">+1.28%</td> [2] => <td class="m-market_percent plus">+0.03%</td> [3] => <td class="m-market_percent plus">+0.18%</td> [4] => <td class="m-market_percent"></td> [5] => <td class="m-market_percent plus">+1.74%</td> ) [1] => Array ( [0] => +1.28% [1] => +1.28% [2] => +0.03% [3] => +0.18% [4] => [5] => +1.74% ) )