序言
我因為學術需求,要取得某些網址的網頁原始碼進行分析,因此寫了以下函式,貼出來與大家分享吧~
PS. 我實際測試時,有的網頁用 HttpWebRequest 取得到正確的結果,有的則要用 WebClient 才取得到,所以這些函式可能不夠通用,不過還是可以做為參考。
開發環境
- VB.Net 2.0 (VS2005)
專案設定
網頁原始碼函式類別
Imports System.Net
Imports System.io
''' <summary>
''' 網頁原始碼取得函式
''' </summary>
''' <remarks>網頁原始碼取得函式By allen080.blogspot.com</remarks>
Public Class WebPageGenFunc
''' <summary>
''' Downloads a page with an HTTP GET request and returns the response body as text.
''' </summary>
''' <param name="url">Address of the page to download.</param>
''' <param name="noCashe">When True, a NoCacheNoStore cache policy is applied and a Cache-Control: no-cache header is sent.</param>
''' <returns>The response body, or Nothing when sending the request or reading the response fails.</returns>
''' <remarks>
''' Failures while getting or reading the response are written to the console and reported as Nothing.
''' The body is decoded with the StreamReader default encoding; the charset declared by the server is not consulted.
''' By allen080.blogspot.com
''' </remarks>
Public Shared Function getHTMLGet(ByVal url As String, Optional ByVal noCashe As Boolean = False) As String
    getHTMLGet = Nothing
    ' Create the request through the WebRequest factory.
    Dim wRq As HttpWebRequest = CType(WebRequest.Create(url), HttpWebRequest)
    With wRq
        .UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0b; Windows NT 5.1)"
        .Headers.Add("Accept-Language", "zh-tw")
        .Method = "GET"
        .Timeout = 10000
        If noCashe Then
            .CachePolicy = New Cache.HttpRequestCachePolicy(Cache.HttpRequestCacheLevel.NoCacheNoStore)
            .Headers.Add("Cache-Control", "no-cache")
        End If
    End With
    Try
        ' Using guarantees the response and its stream are released even when reading throws.
        Using wRs As HttpWebResponse = CType(wRq.GetResponse(), HttpWebResponse)
            Using streamRead As New StreamReader(wRs.GetResponseStream())
                getHTMLGet = streamRead.ReadToEnd()
            End Using
        End Using
    Catch ex As Exception
        ' Best effort: log the failure and let the caller see Nothing.
        Console.WriteLine(ex.ToString)
    End Try
End Function
''' <summary>
''' Downloads a page with an HTTP POST request and returns the response body as text.
''' </summary>
''' <param name="url">Address of the page to request.</param>
''' <param name="postdata">Form data to send, for example a=123&amp;b=456. It is sent as ASCII bytes, so values should already be URL-encoded. When Nothing, an empty body is posted.</param>
''' <param name="noCashe">When True, a NoCacheNoStore cache policy is applied and a Cache-Control: no-cache header is sent.</param>
''' <returns>The response body, or Nothing when sending the request or reading the response fails.</returns>
''' <remarks>
''' Failures while writing the request body or reading the response are written to the console and reported as Nothing.
''' By allen080.blogspot.com
''' </remarks>
Public Shared Function getHTMLPost(ByVal url As String, Optional ByVal postdata As String = Nothing, Optional ByVal noCashe As Boolean = False) As String
    getHTMLPost = Nothing
    ' Create the request through the WebRequest factory.
    Dim wRq As HttpWebRequest = CType(WebRequest.Create(url), HttpWebRequest)
    Dim body As Byte() = Nothing
    With wRq
        .UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0b; Windows NT 5.1)"
        .Headers.Add("Accept-Language", "zh-tw")
        .Method = "POST"
        .Timeout = 10000
        .KeepAlive = False
        If noCashe Then
            .CachePolicy = New Cache.HttpRequestCachePolicy(Cache.HttpRequestCacheLevel.NoCacheNoStore)
            .Headers.Add("Cache-Control", "no-cache")
        End If
        If Not postdata Is Nothing Then
            ' Requests that carry form data get a longer timeout than the 10 second default above.
            .Timeout = 60000
            body = System.Text.Encoding.ASCII.GetBytes(postdata)
            .ContentType = "application/x-www-form-urlencoded"
            .ContentLength = body.Length
        End If
    End With
    Try
        ' Open the request stream exactly once and always close it, even when there is
        ' no body, so the request is completed before the response is read.
        Using requestStream As Stream = wRq.GetRequestStream()
            If Not body Is Nothing Then
                requestStream.Write(body, 0, body.Length)
            End If
        End Using
        ' Using guarantees the response and its stream are released even when reading throws.
        Using wRs As HttpWebResponse = CType(wRq.GetResponse(), HttpWebResponse)
            Using streamRead As New StreamReader(wRs.GetResponseStream())
                getHTMLPost = streamRead.ReadToEnd()
            End Using
        End Using
    Catch ex As Exception
        ' Best effort: log the failure and let the caller see Nothing.
        Console.WriteLine(ex.ToString)
    End Try
End Function
''' <summary>
''' Sends form fields with WebClient.UploadValues and returns the response decoded as UTF-8 text.
''' </summary>
''' <param name="url">Address of the page to request.</param>
''' <param name="postdata">Form fields to upload.</param>
''' <param name="method">HTTP method handed to UploadValues; defaults to POST.</param>
''' <param name="noCashe">When True, a NoCacheNoStore cache policy is applied and a Cache-Control: no-cache header is sent.</param>
''' <returns>The response body, or Nothing when any step fails.</returns>
''' <remarks>
''' Every failure is written to the console and reported as Nothing.
''' By allen080.blogspot.com
''' </remarks>
Public Shared Function getHTMLWebClient(ByVal url As String, ByRef postdata As Specialized.NameValueCollection, Optional ByVal method As String = "POST", Optional ByVal noCashe As Boolean = False) As String
    getHTMLWebClient = Nothing
    Try
        ' WebClient is disposable; Using releases it on both the success and the failure path.
        Using myWebClient As New WebClient()
            If noCashe Then
                myWebClient.CachePolicy = New Cache.HttpRequestCachePolicy(Cache.HttpRequestCacheLevel.NoCacheNoStore)
                myWebClient.Headers.Add("Cache-Control", "no-cache")
            End If
            myWebClient.Headers.Add("Content-Type", "application/x-www-form-urlencoded")
            myWebClient.Headers.Add("Accept-Language", "zh-tw")
            Dim responseArray As Byte() = myWebClient.UploadValues(url, method, postdata)
            getHTMLWebClient = System.Text.Encoding.UTF8.GetString(responseArray)
        End Using
    Catch ex As Exception
        ' Best effort: log the failure and let the caller see Nothing.
        Console.WriteLine(ex.ToString)
    End Try
End Function
''' <summary>
''' Downloads a page and returns the content of its body element.
''' </summary>
''' <param name="url">Address of the page.</param>
''' <param name="postdata">Form data string; when supplied (and postValue is not) the page is fetched with getHTMLPost.</param>
''' <param name="postValue">Form fields; when supplied the page is fetched with getHTMLWebClient.</param>
''' <param name="noCashe">Forwarded to the download helper to disable caching.</param>
''' <returns>Whatever getHTMLTagContain extracts for the "body" tag, or Nothing when the download returned Nothing.</returns>
''' <remarks>With neither postdata nor postValue the page is fetched with getHTMLGet. By allen080.blogspot.com</remarks>
Public Shared Function getHTMLBody(ByVal url As String, Optional ByVal postdata As String = Nothing, Optional ByVal postValue As Specialized.NameValueCollection = Nothing, Optional ByVal noCashe As Boolean = False) As String
    ' Pick the download helper from the kind of form data that was supplied.
    Dim pageHtml As String
    If postValue IsNot Nothing Then
        pageHtml = getHTMLWebClient(url, postValue, noCashe:=noCashe)
    ElseIf postdata IsNot Nothing Then
        pageHtml = getHTMLPost(url, postdata, noCashe)
    Else
        pageHtml = getHTMLGet(url, noCashe)
    End If

    ' Nothing to extract when the download failed.
    If pageHtml Is Nothing Then
        Return Nothing
    End If

    Dim bodyContent As String = Nothing
    getHTMLTagContain(pageHtml, "body", bodyContent)
    Return bodyContent
End Function
''' <summary>
''' Finds the first element with the given tag name and returns where its content starts.
''' </summary>
''' <param name="html">HTML text to search; Nothing yields -1.</param>
''' <param name="tag">Tag name to look for; matched ignoring case and only as a whole name.</param>
''' <param name="contain">Receives the text between the opening tag and the first matching closing tag, or Nothing when no closing tag follows.</param>
''' <param name="indexEnd">Receives the index of that closing tag within html, or -1.</param>
''' <returns>Index of the first character after the opening tag, or -1 when no complete opening tag is found.</returns>
''' <remarks>
''' The content ends at the first closing tag of the same name, so nested elements of the same name are cut short.
''' By allen080.blogspot.com
''' </remarks>
Public Shared Function getHTMLTagContain(ByRef html As String, ByVal tag As String, Optional ByRef contain As String = Nothing, Optional ByRef indexEnd As Integer = -1) As Integer
    contain = Nothing
    indexEnd = -1
    If html Is Nothing Then Return -1

    Dim tagStart As Integer = findOpenTag(html, tag)
    If tagStart < 0 Then Return -1

    ' An opening tag that is never terminated has no content to point at.
    Dim tagClose As Integer = html.IndexOf(">"c, tagStart)
    If tagClose < 0 Then Return -1

    Dim contentStart As Integer = tagClose + 1
    readTagContent(html, tag, contentStart, contain, indexEnd)
    Return contentStart
End Function

''' <summary>
''' Finds the first element with the given tag name, reads one of its quoted attributes and its content.
''' </summary>
''' <param name="html">HTML text to search; Nothing yields -1.</param>
''' <param name="tag">Tag name to look for; matched ignoring case and only as a whole name.</param>
''' <param name="attName">Attribute name to read; matched ignoring case. Only values wrapped in single or double quotes are recognised.</param>
''' <param name="att">Receives the attribute value, or Nothing when the attribute or its closing quote is missing.</param>
''' <param name="contain">Receives the text between the opening tag and the first matching closing tag, or Nothing when no closing tag follows.</param>
''' <param name="indexEnd">Receives the index of that closing tag within html, or -1.</param>
''' <returns>Index of the first character of the attribute value, or -1 when the tag or the attribute is not found.</returns>
''' <remarks>
''' The opening tag is taken to end at the first "greater than" character after its start.
''' By allen080.blogspot.com
''' </remarks>
Public Shared Function getHTMLTagAtt(ByRef html As String, ByVal tag As String, ByVal attName As String, Optional ByRef att As String = Nothing, Optional ByRef contain As String = Nothing, Optional ByRef indexEnd As Integer = -1) As Integer
    att = Nothing
    contain = Nothing
    indexEnd = -1
    If html Is Nothing Then Return -1

    Dim tagStart As Integer = findOpenTag(html, tag)
    If tagStart < 0 Then Return -1

    Dim tagClose As Integer = html.IndexOf(">"c, tagStart)
    If tagClose < 0 Then Return -1

    Dim valueStart As Integer = findAttributeValue(html, attName, tagStart, tagClose)
    If valueStart > -1 Then
        ' Close the value with the same quote character that opened it, so a value
        ' such as "it's" is not cut at the apostrophe.
        Dim quote As Char = html.Chars(valueStart - 1)
        Dim valueEnd As Integer = html.IndexOf(quote, valueStart)
        If valueEnd > -1 Then
            att = html.Substring(valueStart, valueEnd - valueStart)
        End If
    End If

    readTagContent(html, tag, tagClose + 1, contain, indexEnd)
    Return valueStart
End Function

''' <summary>
''' Returns the index of the first opening tag whose name is exactly tag, or -1.
''' </summary>
Private Shared Function findOpenTag(ByVal html As String, ByVal tag As String) As Integer
    Dim marker As String = "<" & tag
    Dim pos As Integer = html.IndexOf(marker, StringComparison.OrdinalIgnoreCase)
    While pos > -1
        Dim after As Integer = pos + marker.Length
        If after < html.Length Then
            ' The name must end here; otherwise "a" would also match abbr or address.
            Dim following As Char = html.Chars(after)
            If following = ">"c OrElse following = "/"c OrElse Char.IsWhiteSpace(following) Then
                Return pos
            End If
        End If
        pos = html.IndexOf(marker, pos + 1, StringComparison.OrdinalIgnoreCase)
    End While
    Return -1
End Function

''' <summary>
''' Returns the index of the first character of a quoted attribute value inside the opening tag
''' spanning tagStart up to (not including) tagClose, or -1 when the attribute is absent.
''' </summary>
Private Shared Function findAttributeValue(ByVal html As String, ByVal attName As String, ByVal tagStart As Integer, ByVal tagClose As Integer) As Integer
    Dim key As String = attName & "="
    Dim pos As Integer = html.IndexOf(key, tagStart, tagClose - tagStart, StringComparison.OrdinalIgnoreCase)
    While pos > -1
        Dim quoteAt As Integer = pos + key.Length
        If pos > tagStart AndAlso quoteAt < tagClose Then
            Dim opener As Char = html.Chars(quoteAt)
            Dim before As Char = html.Chars(pos - 1)
            ' Require a separator in front of the name so "href" does not match inside "data-href".
            Dim startsName As Boolean = Char.IsWhiteSpace(before) OrElse before = """"c OrElse before = "'"c
            If startsName AndAlso (opener = """"c OrElse opener = "'"c) Then
                Return quoteAt + 1
            End If
        End If
        pos = html.IndexOf(key, pos + 1, tagClose - (pos + 1), StringComparison.OrdinalIgnoreCase)
    End While
    Return -1
End Function

''' <summary>
''' Copies the text between contentStart and the first closing tag named tag into contain and
''' stores the closing tag index in indexEnd; leaves both untouched when no closing tag follows.
''' </summary>
Private Shared Sub readTagContent(ByVal html As String, ByVal tag As String, ByVal contentStart As Integer, ByRef contain As String, ByRef indexEnd As Integer)
    Dim closeAt As Integer = html.IndexOf("</" & tag & ">", contentStart, StringComparison.OrdinalIgnoreCase)
    If closeAt > -1 Then
        indexEnd = closeAt
        contain = html.Substring(contentStart, closeAt - contentStart)
    End If
End Sub
''' <summary>
''' URL-encodes a parameter value.
''' </summary>
''' <param name="value">Text to encode.</param>
''' <returns>The result of HttpUtility.UrlEncode for value.</returns>
''' <remarks>Requires a reference to the System.Web assembly. By allen080.blogspot.com</remarks>
Public Shared Function getEncodeStr(ByVal value As String) As String
    Return System.Web.HttpUtility.UrlEncode(value)
End Function
''' <summary>
''' Decodes a URL-encoded parameter value.
''' </summary>
''' <param name="value">Encoded text to decode.</param>
''' <returns>The result of HttpUtility.UrlDecode for value.</returns>
''' <remarks>Requires a reference to the System.Web assembly. By allen080.blogspot.com</remarks>
Public Shared Function getDecodeStr(ByVal value As String) As String
    Return System.Web.HttpUtility.UrlDecode(value)
End Function
End Class
使用範例程式
Dim url As String = "http://allen080.blogspot.com/"
' Download the complete HTML of the page with the WebRequest-based GET helper.
Dim htmlAll As String = WebPageGenFunc.getHTMLGet(url)
' Download the page again and keep only the content of its body element.
Dim htmlBody As String = WebPageGenFunc.getHTMLBody(url)
Dim indexEnd As Integer = -1
Dim valueAttribute As String = Nothing
Dim valueContain As String = Nothing
' getHTMLBody returns Nothing when the download fails, so only search a real result.
If Not htmlBody Is Nothing Then
    indexEnd = htmlBody.IndexOf("<h3 class='post-title entry-title'>")
End If
If indexEnd > 0 Then
    htmlBody = htmlBody.Substring(indexEnd)
    ' Content of the first a element after the post title marker.
    WebPageGenFunc.getHTMLTagContain(htmlBody, "a", valueContain, indexEnd)
    Console.WriteLine("a標籤1的內容: " & valueContain)
    ' indexEnd is -1 when that element was never closed; skip the second lookup then.
    If indexEnd > -1 Then
        htmlBody = htmlBody.Substring(indexEnd)
        ' Content and href attribute of the next a element.
        WebPageGenFunc.getHTMLTagAtt(htmlBody, "a", "href", valueAttribute, valueContain, indexEnd)
        Console.WriteLine("a標籤2的內容與href的屬性: " & valueAttribute & vbTab & ",內容:" & valueContain)
    End If
End If
' Fetch data by posting form fields through WebClient.
url = "http://msdn.microsoft.com/zh-tw/library/cc983670.aspx"
Dim searchStr As String = "倫倫3號"
Dim postdata As New Specialized.NameValueCollection
postdata.Add("__EVENTTARGET", "")
postdata.Add("__EVENTARGUMENT", "")
postdata.Add("__VIEWSTATE", "/wEPDwUBMGQYAgUeX19Db250cm9sc1JlcXVpcmVQb3N0QmFja0tleV9fFgYFImN0bDAwJE1hc3RoZWFkMSRjdGwwMCRTZWFyY2hCdXR0b24FImN0bDAwJFdpa2lDb250ZW50JEVkaXRvciRSYWRFZGl0b3IFR2N0bDAwJFdpa2lDb250ZW50JEVkaXRvciRjdGwwMF9XaWtpQ29udGVudF9FZGl0b3JfUmFkRWRpdG9yZGlhbG9nT3BlbmVyBU5jdGwwMCRXaWtpQ29udGVudCRFZGl0b3IkY3RsMDBfV2lraUNvbnRlbnRfRWRpdG9yX1JhZEVkaXRvcmRpYWxvZ09wZW5lcl9XaW5kb3cFI2N0bDAwJFdpa2lDb250ZW50JEVkaXRvciRSYWRUb29sdGlwBR5jdGwwMCRXaWtpQ29udGVudCRFZGl0b3IkY3RsMDAFL2N0bDAwJE1hc3RoZWFkMSRVc2VyUmVjb2duaXRpb25GbHlvdXQkbG9naW5WaWV3Dw9kAgFkTR/9R2uARDjXggFEm6e0WaTZWHE=")
postdata.Add("ctl00$Masthead1$ctl00$SearchTextBox", searchStr)
postdata.Add("ctl00$Masthead1$ctl00$WebButton", "Web")
postdata.Add("go", "true")
htmlBody = WebPageGenFunc.getHTMLBody(url, , postdata)
indexEnd = -1
If Not htmlBody Is Nothing Then
    indexEnd = htmlBody.IndexOf("<span class=""sb_count""")
End If
If indexEnd > 0 Then
    htmlBody = htmlBody.Substring(indexEnd)
    ' Content of the result-count span.
    WebPageGenFunc.getHTMLTagContain(htmlBody, "span", valueContain, indexEnd)
    Console.WriteLine("使用WebClient的Post方法取得資料: " & valueContain)
End If
' Fetch the same data by posting a form data string through WebRequest.
searchStr = String.Empty
' Each value has to be URL-encoded while the form data string is assembled.
For i As Integer = 0 To postdata.Count - 1
    searchStr &= "&" & postdata.GetKey(i) & "=" & WebPageGenFunc.getEncodeStr(postdata(i))
Next
' Drop the leading "&" added by the loop.
searchStr = searchStr.Substring(1)
htmlBody = WebPageGenFunc.getHTMLBody(url, searchStr)
indexEnd = -1
If Not htmlBody Is Nothing Then
    indexEnd = htmlBody.IndexOf("<span class=""sb_count""")
End If
If indexEnd > 0 Then
    htmlBody = htmlBody.Substring(indexEnd)
    ' Content of the result-count span.
    WebPageGenFunc.getHTMLTagContain(htmlBody, "span", valueContain, indexEnd)
    Console.WriteLine("使用WebRequest的Post方法取得資料: " & valueContain)
End If
執行結果
a標籤1的內容: [程式]VB.Net2.0資料庫通用存取函式
a標籤2的內容與href的屬性: http://allen080.blogspot.com/2009/04/vbnet20.html ,內容:<abbr class='published' title='2009-04-20T22:25:00-07:00'>下午 10:25</abbr>
使用WebClient的Post方法取得資料: 第 1-9 筆,共 159 筆搜尋結果
使用WebRequest的Post方法取得資料: 第 1-9 筆,共 159 筆搜尋結果


























訂閱此Blog

