序言
這是我依照 VB.Net 的網頁原始碼處理函式改寫而成的 PHP 版程式,可以在取得網頁後,再針對特定標籤進行切割處理。
函式原始碼
<?php
/**
 * Minimal string-based HTML fetcher and tag extractor.
 *
 * The class downloads a page through PHP's stream wrappers and cuts out the
 * content or an attribute of the first matching element using plain string
 * searches. It is not a real HTML parser: the opening tag is assumed to end
 * at the first ">" after the tag name, and markup inside comments or scripts
 * is not treated specially.
 */
class HTMLParser
{
    /**
     * Fetches a URL and returns the response body.
     *
     * @param string     $url     Address to fetch.
     * @param string     $method  HTTP method; parameters are only sent for "GET" and "POST".
     * @param array|null $param   Name/value pairs, e.g. array("name" => 'tim', "content" => 'test').
     *                            Appended to the query string for GET, sent as a form body for POST.
     * @param bool       $noCashe When true, a "Cache-Control: no-cache" request header is added.
     *
     * @return string|false The body as returned by file_get_contents(), false on failure.
     */
    public function getHTML($url, $method = "GET", $param = NULL, $noCashe = False)
    {
        $headers = "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)\r\n"
            . "Accept-language: zh-tw\r\n";
        if ($noCashe) {
            $headers .= "Cache-Control: no-cache\r\n";
        }

        $http = array('method' => $method);
        $hasParams = !empty($param);

        if ($hasParams && $method === "GET") {
            // URL-encode the values (not HTML-encode them) and keep any query
            // string that is already part of the URL.
            $separator = (strpos($url, '?') === false) ? '?' : '&';
            $url .= $separator . http_build_query($param, '', '&');
        }

        if ($hasParams && $method === "POST") {
            $body = http_build_query($param, '', '&');
            $headers .= "Content-Type: application/x-www-form-urlencoded\r\n"
                . "Content-Length: " . strlen($body) . "\r\n";
            $http['content'] = $body;
        }

        $http['header'] = $headers;
        $context = stream_context_create(array('http' => $http));

        // Open the file using the HTTP headers set above.
        return file_get_contents($url, false, $context);
    }

    /**
     * Fetches a page and returns the content of its <body> element.
     *
     * @param string     $url       Address to fetch.
     * @param array|null $postValue When non-empty the page is requested with POST and
     *                              these values; otherwise a plain GET is issued.
     * @param bool       $noCashe   Forwarded to getHTML().
     *
     * @return string|null The body content, or NULL when the download failed or no
     *                     complete <body> element was found.
     */
    public function getHTMLBody($url, $postValue = NULL, $noCashe = False)
    {
        $method = !empty($postValue) ? "POST" : "GET";
        $html = $this->getHTML($url, $method, $postValue, $noCashe);

        $htmlBody = NULL;
        if (is_string($html) && $html !== '') {
            $this->getHTMLTagContain($html, "body", $htmlBody);
        }
        return $htmlBody;
    }

    /**
     * Locates the first element with the given tag name and extracts its content.
     *
     * @param string      $html     Markup to search.
     * @param string      $tag      Tag name without angle brackets; matched case-insensitively.
     * @param string|null $contain  Receives the text between the opening tag and its matching
     *                              closing tag, or NULL when no complete element was found.
     * @param int         $indexEnd Receives the offset just past the ">" of the closing tag,
     *                              or -1 when it could not be determined.
     *
     * @return int Offset of the first content byte (just past the opening tag's ">"),
     *             or -1 when no opening tag was found.
     */
    public function getHTMLTagContain($html, $tag, &$contain = NULL, &$indexEnd = -1)
    {
        $tag = strtolower($tag);
        $contain = NULL;
        $indexEnd = -1;

        if (!is_string($html) || $html === '') {
            return -1;
        }

        // strtolower() works byte-wise, so offsets found in the lowered copy
        // are valid in the original string as well.
        $lowerHtml = strtolower($html);

        $tagStart = $this->findTag($lowerHtml, '<' . $tag, 0);
        if ($tagStart === false) {
            return -1;
        }
        $tagClose = strpos($html, '>', $tagStart);
        if ($tagClose === false) {
            return -1;
        }

        $contentStart = $tagClose + 1;
        $this->extractContent($html, $lowerHtml, $tag, $contentStart, $contain, $indexEnd);
        return $contentStart;
    }

    /**
     * Locates the first element with the given tag name that carries the given
     * quoted attribute, and extracts both the attribute value and the content.
     *
     * @param string      $html     Markup to search.
     * @param string      $tag      Tag name without angle brackets; matched case-insensitively.
     * @param string      $attName  Attribute name; matched case-insensitively. The value must be
     *                              enclosed in single or double quotes.
     * @param string|null $att      Receives the attribute value, or NULL when no element matched.
     * @param string|null $contain  Receives the content of the matched element, or NULL.
     * @param int         $indexEnd Receives the offset just past the ">" of the closing tag, or -1.
     *
     * @return int Offset of the first content byte of the matched element, or -1 when no
     *             element with that attribute was found.
     */
    public function getHTMLTagAtt($html, $tag, $attName, &$att = NULL, &$contain = NULL, &$indexEnd = -1)
    {
        $tag = strtolower($tag);
        $att = NULL;
        $contain = NULL;
        $indexEnd = -1;

        if (!is_string($html) || $html === '') {
            return -1;
        }

        $lowerHtml = strtolower($html);
        // The name must follow whitespace or a closing quote so that, for
        // example, "data-href" is not mistaken for "href".
        $pattern = '/[\s"\']' . preg_quote($attName, '/') . '\s*=\s*(["\'])(.*?)\1/is';

        $cursor = 0;
        while (($tagStart = $this->findTag($lowerHtml, '<' . $tag, $cursor)) !== false) {
            $tagClose = strpos($html, '>', $tagStart);
            if ($tagClose === false) {
                break;
            }

            $openTag = substr($html, $tagStart, $tagClose - $tagStart);
            if (preg_match($pattern, $openTag, $match) === 1) {
                $att = $match[2];
                $contentStart = $tagClose + 1;
                $this->extractContent($html, $lowerHtml, $tag, $contentStart, $contain, $indexEnd);
                return $contentStart;
            }

            // This tag lacks the attribute; continue with the next candidate.
            $cursor = $tagClose + 1;
        }
        return -1;
    }

    /**
     * Fills $contain and $indexEnd for the element whose content starts at $contentStart.
     */
    private function extractContent($html, $lowerHtml, $tag, $contentStart, &$contain, &$indexEnd)
    {
        $contain = NULL;
        $indexEnd = -1;

        $closeStart = $this->findClosingTag($lowerHtml, $tag, $contentStart);
        if ($closeStart === false) {
            return;
        }

        $contain = substr($html, $contentStart, $closeStart - $contentStart);
        $closeEnd = strpos($html, '>', $closeStart);
        if ($closeEnd !== false) {
            $indexEnd = $closeEnd + 1;
        }
    }

    /**
     * Returns the offset of the closing tag that balances an element whose content
     * starts at $from, counting nested elements of the same name; false if unbalanced.
     */
    private function findClosingTag($lowerHtml, $tag, $from)
    {
        $openNeedle = '<' . $tag;
        $closeNeedle = '</' . $tag;
        $depth = 1;
        $cursor = $from;

        while (true) {
            $close = $this->findTag($lowerHtml, $closeNeedle, $cursor);
            if ($close === false) {
                return false;
            }

            $open = $this->findTag($lowerHtml, $openNeedle, $cursor);
            if ($open !== false && $open < $close) {
                // A nested element starts before the next closing tag.
                $depth++;
                $cursor = $open + strlen($openNeedle);
                continue;
            }

            $depth--;
            if ($depth === 0) {
                return $close;
            }
            $cursor = $close + strlen($closeNeedle);
        }
    }

    /**
     * Finds $needle ("<tag" or "</tag") at or after $offset, accepting only
     * occurrences where the tag name ends there, so "<a" does not match "<abbr".
     * Returns the offset, or false when there is no such occurrence.
     */
    private function findTag($lowerHtml, $needle, $offset)
    {
        $length = strlen($lowerHtml);
        $needleLength = strlen($needle);

        while ($offset <= $length) {
            $pos = strpos($lowerHtml, $needle, $offset);
            if ($pos === false) {
                return false;
            }

            $next = substr($lowerHtml, $pos + $needleLength, 1);
            if ($next === false || $next === '' || strpos(" \t\r\n\f>/", $next) !== false) {
                return $pos;
            }
            $offset = $pos + 1;
        }
        return false;
    }
}
?>
使用範例
<?php
include 'myClass/HTMLParser.php';

$htmlParser = new HTMLParser();
$url = "http://allen080.blogspot.com/2009/05/vbnet20.html";

// Fetch the HTML of the whole page.
$htmlAll = $htmlParser->getHTML($url);

// Fetch only the content of the page's <body> element.
$htmlBody = $htmlParser->getHTMLBody($url);

if ($htmlBody != NULL) {
    // Skip ahead to the marker <span>. A match at offset 0 is valid too,
    // so the strpos() result is compared against false only.
    $markerPos = strpos($htmlBody, "<span class='item-control blog-admin'>");
    if ($markerPos !== false) {
        $htmlBody = substr($htmlBody, $markerPos);

        $indexEnd = -1;
        $valueAttribute = NULL;
        $valueContain = NULL;

        // Read the content of the first <span> element.
        $htmlParser->getHTMLTagContain($htmlBody, "span", $valueContain, $indexEnd);
        echo "span標籤的內容: " . $valueContain ."<BR>\n";

        // Read the content and the onclick attribute of the first <a> element
        // that has an onclick attribute.
        $htmlParser->getHTMLTagAtt($htmlBody, "a", "onclick", $valueAttribute, $valueContain, $indexEnd);
        echo "a標籤的內容與onclick的屬性: " . $valueAttribute ."\t" . ",內容:" . $valueContain;
    }
}
?>
執行結果
span標籤的內容:
<a class='quickedit' href='http://www.blogger.com/rearrange?blogID=2698062899592178296&widgetType=HTML&widgetId=HTML1&action=editWidget' onclick='return _WidgetManager._PopupConfig(document.getElementById("HTML1"));' target='configHTML1' title='編輯'>
<img alt='' height='18' src='http://img1.blogblog.com/img/icon18_wrench_allbkg.png' width='18'/>
</a>
<BR>
a標籤的內容與onclick的屬性: return _WidgetManager._PopupConfig(document.getElementById("HTML1")); ,內容:
<img alt='' height='18' src='http://img1.blogblog.com/img/icon18_wrench_allbkg.png' width='18'/>
沒有留言:
張貼留言