[程式]VB.Net2.0網頁原始碼取得函式

序言

我因為學術需求,要取得某些網址的網頁原始碼進行分析,因此寫了以下函式,貼出來與大家分享吧~

*2009/10/29 更新函式,加強切割網頁函式、抓網頁標頭。

開發環境

  • VB.Net 2.0 (VS2005)

專案設定

  • 建立專案後預設的專案屬性下,參考的元件如下圖:

  • 下面的程式中會用到【Web】這個物件,所以我們需要加入【System.Web】這個元件:

    點選加入,在.Net分頁下找到【System.Web】這個元件

網頁原始碼函式類別

Imports System.Net
Imports System.io
Imports System.Reflection

Public Class WebPageGenFunc
    Private Shared rd As New Random(Now.Second)
    Private Shared sec As Integer
    Private Shared lastsec As Integer = -1
    Public Shared Function sleepTime(Optional ByVal minimatime As Integer = 2600, Optional ByVal rangeSecond As Integer = 5) As Integer
        sec = (rd.Next Mod rangeSecond) * 1000 + minimatime
        While sec = lastsec
            sec = (rd.Next Mod rangeSecond) * 1000 + minimatime
        End While
        lastsec = sec
        System.Threading.Thread.Sleep(sec)
        Return sec
    End Function


    Private Shared Sub SetAllowUnsafeHeaderParsing20()
        Dim a As New System.Net.Configuration.SettingsSection
        Dim aNetAssembly As System.Reflection.Assembly = Assembly.GetAssembly(a.GetType)
        Dim aSettingsType As Type = aNetAssembly.GetType("System.Net.Configuration.SettingsSectionInternal")
        Dim args As Object() = Nothing
        Dim anInstance As Object = aSettingsType.InvokeMember("Section", BindingFlags.Static Or BindingFlags.GetProperty Or BindingFlags.NonPublic, Nothing, Nothing, args)
        Dim aUseUnsafeHeaderParsing As FieldInfo = aSettingsType.GetField("useUnsafeHeaderParsing", BindingFlags.NonPublic Or BindingFlags.Instance)
        aUseUnsafeHeaderParsing.SetValue(anInstance, True)
    End Sub


    ''' <summary>
    ''' 使用Get方法取得網頁內容
    ''' </summary>
    ''' <param name="url">網址</param>
    ''' <param name="noCashe">不使用快取</param>
    ''' <returns>網頁的HTML</returns>
    ''' <remarks>使用Get方法取得網頁內容</remarks>
    Public Shared Function getHTMLGet(ByVal url As String, Optional ByVal noCashe As Boolean = False) As String
        getHTMLGet = Nothing
        SetAllowUnsafeHeaderParsing20()
        Dim wRs As HttpWebResponse
        Dim wRq As HttpWebRequest
        ' Create the request using the WebRequestFactory.

        wRq = CType(WebRequest.Create(url), HttpWebRequest)
        With wRq
            .UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"
            .ContentType = "application/x-www-form-urlencoded"
            .Headers.Add("Accept-Language", "zh-tw")
            .Method = "GET"
            .Timeout = 10000
            If noCashe Then
                Dim policy As New Cache.HttpRequestCachePolicy(Cache.HttpRequestCacheLevel.NoCacheNoStore)
                .CachePolicy = policy
                .Headers.Add("Cache-Control", "no-cache")
            End If
        End With

        Try
            ' Return the response stream.
            wRs = CType(wRq.GetResponse(), HttpWebResponse)
            Dim streamResponse As Stream = wRs.GetResponseStream()
            Dim streamRead As New StreamReader(streamResponse)
            Dim responseString As String = streamRead.ReadToEnd()
            getHTMLGet = responseString
            ' Close Stream object.
            streamResponse.Close()
            streamRead.Close()
            ' Release the HttpWebResponse.
            wRs.Close()
        Catch ex As Exception
            Console.WriteLine(ex.ToString)
        End Try
    End Function

    ''' <summary>
    ''' 使用Post方法取得網頁內容
    ''' </summary>
    ''' <param name="url">網址</param>
    ''' <param name="postdata">傳遞參數,如a=123&b=456</param>
    ''' <param name="noCashe">不使用快取</param>
    ''' <returns>網頁的HTML</returns>
    ''' <remarks>使用Post方法取得網頁內容</remarks>
    Public Shared Function getHTMLPost(ByVal url As String, Optional ByVal postdata As String = Nothing, Optional ByVal noCashe As Boolean = False) As String
        getHTMLPost = Nothing
        SetAllowUnsafeHeaderParsing20()
        Dim wRs As HttpWebResponse
        Dim wRq As HttpWebRequest
        ' Create the request using the WebRequestFactory.
        wRq = CType(WebRequest.Create(url), HttpWebRequest)
        With wRq
            .UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"
            .Headers.Add("Accept-Language", "zh-tw")
            .Method = "POST"
            .Timeout = 10000
            .KeepAlive = False
            If noCashe Then
                Dim policy As New Cache.HttpRequestCachePolicy(Cache.HttpRequestCacheLevel.NoCacheNoStore)
                .CachePolicy = policy
                .Headers.Add("Cache-Control", "no-cache")
            End If
            If Not postdata Is Nothing Then
                .Timeout = 60000
                Dim encoding As New System.Text.ASCIIEncoding()
                Dim byte1 As Byte() = encoding.GetBytes(postdata)
                .ContentType = "application/x-www-form-urlencoded"
                .ContentLength = byte1.Length
                .GetRequestStream().Write(byte1, 0, byte1.Length)
            End If
        End With
        wRq.GetRequestStream().Close()
        Try
            ' Return the response stream.
            wRs = CType(wRq.GetResponse(), HttpWebResponse)
            Dim streamResponse As Stream = wRs.GetResponseStream()
            Dim streamRead As New StreamReader(streamResponse)
            Dim responseString As String = streamRead.ReadToEnd()
            getHTMLPost = responseString
            ' Close Stream object.
            streamResponse.Close()
            streamRead.Close()
            ' Release the HttpWebResponse.
            wRs.Close()

        Catch ex As Exception
            Console.WriteLine(ex.ToString)
        End Try
    End Function
    ''' <summary>
    ''' 使用WebClient取得網頁內容
    ''' </summary>
    ''' <param name="url">網址</param>
    ''' <param name="postdata">傳遞的參數</param>
    ''' <param name="method">使用的方法,預設為POST</param>
    ''' <param name="noCashe">不使用快取</param>
    ''' <returns>網頁的HTML</returns>
    ''' <remarks>使用WebClient取得網頁內容</remarks>
    Public Shared Function getHTMLWebClient(ByVal url As String, ByRef postdata As Specialized.NameValueCollection, Optional ByVal method As String = "POST", Optional ByVal noCashe As Boolean = False) As String
        getHTMLWebClient = Nothing
        Try
            SetAllowUnsafeHeaderParsing20()
            Dim myWebClient As New WebClient()
            If noCashe Then
                Dim policy As New Cache.HttpRequestCachePolicy(Cache.HttpRequestCacheLevel.NoCacheNoStore)
                myWebClient.CachePolicy = policy
                myWebClient.Headers.Add("Cache-Control", "no-cache")
            Else
                Dim rheaders As WebHeaderCollection = myWebClient.ResponseHeaders
                If Not rheaders Is Nothing Then
                    Dim header As String = rheaders("Set-Cookie")
                    If Not header Is Nothing Then
                        myWebClient.Headers.Add("Cookie", header)
                    End If
                End If
            End If
            myWebClient.Headers.Add("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)")
            myWebClient.Headers.Add("Content-Type", "application/x-www-form-urlencoded")
            myWebClient.Headers.Add("Accept-Language", "zh-tw")
            If postdata Is Nothing Then postdata = New Specialized.NameValueCollection
            Dim responseArray As Byte() = myWebClient.UploadValues(url, method, postdata)
            Dim encoding As New System.Text.UTF8Encoding
            getHTMLWebClient = encoding.GetString(responseArray)
        Catch ex As Exception
            Console.WriteLine(ex.ToString)
        End Try
    End Function

    ''' <summary>
    ''' 取得網頁的Body區段
    ''' </summary>
    ''' <param name="url">網址</param>
    ''' <param name="postdata">傳遞的參數,若有值會使用getHTMLPost取得</param>
    ''' <param name="postValue">傳遞的參數,若有值會使用getHTMLWebClient取得</param>
    ''' <param name="noCashe">不使用快取</param>
    ''' <returns>網頁的HTML</returns>
    ''' <remarks>取得網頁的Body區段</remarks>
    Public Shared Function getHTMLBody(ByVal url As String, Optional ByVal postdata As String = Nothing, Optional ByVal postValue As Specialized.NameValueCollection = Nothing, Optional ByVal noCashe As Boolean = False, Optional ByVal useWebClient As Boolean = False) As String
        getHTMLBody = Nothing
        Dim html As String
        If Not postValue Is Nothing Then
            html = getHTMLWebClient(url, postValue, , noCashe)
        ElseIf Not postdata Is Nothing Then
            html = getHTMLPost(url, postdata, noCashe)
        Else
            If useWebClient Then
                html = getHTMLWebClient(url, Nothing, "GET", noCashe)
            Else
                html = getHTMLGet(url, noCashe)
            End If

        End If
        If Not html Is Nothing Then
            getHTMLTagContain(html, "body", getHTMLBody)
        End If
    End Function
    ''' <summary>
    ''' 取得HTML中第一個符合的標籤內容的指標,並將取得的標籤內容寫入參數中
    ''' </summary>
    ''' <param name="html">HTML</param>
    ''' <param name="tag">標籤</param>
    ''' <param name="contain">取得的標籤內容</param>
    ''' <param name="indexEnd">此標籤結束於HTML的指標</param>
    ''' <returns>標籤內容的指標</returns>
    ''' <remarks>取得HTML中第一個符合的標籤內容的指標,並將取得的標籤內容寫入參數中</remarks>
    Public Shared Function getHTMLTagContain(ByVal html As String, ByVal tag As String, Optional ByRef contain As String = Nothing, Optional ByRef indexEnd As Integer = -1) As Integer
        tag = tag.ToLower
        contain = Nothing
        indexEnd = -1
        Dim indexBegin As Integer = -1
        Dim indexbBegin As Integer = -1
        If Not html Is Nothing Then
            indexBegin = html.ToLower.IndexOf("<" & tag)
            If indexBegin > -1 Then
                indexbBegin = html.IndexOf(">", indexBegin)
                If indexbBegin > -1 Then
                    indexbBegin += 1
                End If
                Dim findTag As Boolean = False
                indexEnd = indexbBegin
                Dim lastStart As Integer = indexbBegin
                Dim stopLimit2 As Integer = 9999
                Do
                    indexEnd = html.ToLower.IndexOf("</" & tag, indexEnd)
                    If indexEnd > -1 Then
                        If html.Substring(lastStart, indexEnd - lastStart).IndexOf("<" & tag) > -1 Then
                            lastStart = indexEnd
                            indexEnd = indexEnd + ("</" & tag).Length
                            findTag = True
                        Else
                            findTag = False
                        End If
                    Else
                        findTag = False
                    End If
                    stopLimit2 -= 1
                Loop While findTag And stopLimit2 > 0

                If indexEnd > -1 Then
                    contain = html.Substring(indexbBegin, indexEnd - indexbBegin)

                    indexEnd = html.IndexOf(">", indexEnd)
                    If indexEnd > -1 Then
                        indexEnd += 1
                    End If
                End If

            End If
        End If
        Return indexbBegin
    End Function
    ''' <summary>
    ''' 取得HTML中第一個符合的標籤屬性的指標,並將取得的標籤內容與該屬性內容寫入參數中
    ''' </summary>
    ''' <param name="html">HTML</param>
    ''' <param name="tag">標籤</param>
    ''' <param name="attName">屬性名稱</param>
    ''' <param name="att">取得的屬性內容</param>
    ''' <param name="contain">取得的標籤內容</param>
    ''' <param name="indexEnd">此標籤結束於HTML的指標</param>
    ''' <returns></returns>
    ''' <remarks>取得HTML中第一個符合的標籤屬性的指標,並將取得的標籤內容與該屬性內容寫入參數中</remarks>
    Public Shared Function getHTMLTagAtt(ByVal html As String, ByVal tag As String, ByVal attName As String, Optional ByRef att As String = Nothing, Optional ByRef contain As String = Nothing, Optional ByRef indexEnd As Integer = -1) As Integer
        att = Nothing
        contain = Nothing
        tag = tag.ToLower
        attName = attName.ToLower
        indexEnd = -1
        Dim indexBegin As Integer = -1
        Dim indexaBegin As Integer = -1
        Dim indexbEnd As Integer = -1
        Dim findTag As Boolean = False
        Dim sign As String

        If Not html Is Nothing Then
            indexBegin = 0
            Dim stopLimit1 As Integer = 9999
            Do
                indexbEnd = -1
                indexBegin = html.ToLower.IndexOf("<" & tag, indexBegin)
                If indexBegin > -1 Then
                    indexbEnd = html.IndexOf(">", indexBegin)
                    If indexbEnd > -1 Then
                        indexaBegin = html.Substring(0, indexbEnd).Replace("""", "'").ToLower.IndexOf(attName & "='", indexBegin)
                        If indexaBegin > -1 Then
                            sign = html.Substring(indexaBegin + (attName & "=").Length, 1)
                            indexaBegin += (attName & "='").Length
                            Dim indexaEnd As Integer = html.Substring(0, indexbEnd).IndexOf(sign, indexaBegin)
                            If indexaEnd > -1 Then
                                att = html.Substring(indexaBegin, indexaEnd - indexaBegin)
                            End If
                        Else
                            indexBegin = indexbEnd + 1
                            Continue Do
                        End If
                        indexbEnd += 1
                    End If

                    findTag = False
                    indexEnd = indexbEnd
                    Dim lastStart As Integer = indexbEnd

                    Dim stopLimit2 As Integer = 9999
                    Do
                        indexEnd = html.ToLower.IndexOf("</" & tag, indexEnd)
                        If indexEnd > -1 Then
                            If html.Substring(lastStart, indexEnd - lastStart).IndexOf("<" & tag) > -1 Then
                                lastStart = indexEnd
                                indexEnd = indexEnd + ("</" & tag).Length
                                findTag = True
                            Else
                                findTag = False
                            End If
                        Else
                            findTag = False
                        End If
                        stopLimit2 -= 1
                    Loop While findTag And stopLimit2 > 0

                    If indexEnd > -1 Then
                        contain = html.Substring(indexbEnd, indexEnd - indexbEnd)
                        indexEnd = html.IndexOf(">", indexEnd)
                        If indexEnd > -1 Then
                            indexEnd += 1
                        End If
                    End If
                Else
                    Exit Do
                End If
                stopLimit1 -= 1
            Loop While att Is Nothing And stopLimit1 > 0
        End If
        Return indexbEnd
    End Function
    ''' <summary>
    ''' Url參數值編碼
    ''' </summary>
    ''' <param name="value">參數值</param>
    ''' <returns>編碼結果</returns>
    ''' <remarks>Url參數值編碼</remarks>
    Public Shared Function getEncodeStr(ByVal value As String)
        Return Web.HttpUtility.UrlEncode(value)
    End Function
    ''' <summary>
    ''' Url參數值解碼
    ''' </summary>
    ''' <param name="value">參數值</param>
    ''' <returns>解碼結果</returns>
    ''' <remarks>Url參數值解碼</remarks>
    Public Shared Function getDecodeStr(ByVal value As String)
        Return Web.HttpUtility.UrlDecode(value)
    End Function
End Class

使用範例程式

        Dim indexEnd As Integer
        Dim valueAttribute As String
        Dim valueContain As String
        Dim url As String = "http://allen080.blogspot.com/2009/05/vbnet20.html"
        '使用WebRequest的Get方法取得整個網頁的HTML
        'Dim htmlAll As String = WebPageGenFunc.getHTMLGet(url)
        '取得網頁Body的部份
        Dim htmlBody As String = WebPageGenFunc.getHTMLBody(url)
        If Not htmlBody Is Nothing Then
            indexEnd = htmlBody.IndexOf("<span class='item-control blog-admin'>")
            valueAttribute = Nothing
            valueContain = Nothing
            If indexEnd > 0 Then
                htmlBody = htmlBody.Substring(indexEnd)
                '取得a標籤1的內容
                WebPageGenFunc.getHTMLTagContain(htmlBody, "span", valueContain, indexEnd)
                Console.WriteLine("span標籤的內容: " & valueContain)
                'htmlBody = htmlBody.Substring(indexEnd)
                '取得a標籤2的內容與href的屬性
                WebPageGenFunc.getHTMLTagAtt(htmlBody, "a", "onclick", valueAttribute, valueContain, indexEnd)
                Console.WriteLine("a標籤的內容與onclick的屬性: " & valueAttribute & vbTab & ",內容:" & valueContain)
            End If
        End If
        

        '使用WebClient的Post方法取得資料
        url = "http://al080.summerhost.info/invoiceMe/index.php"
        Dim searchStr As String = "123"
        Dim postdata As New Specialized.NameValueCollection
        postdata.Add("ddlYear", "2009")
        postdata.Add("ddlMonth", "7-8")
        postdata.Add("txtInvNO", searchStr)
        postdata.Add("btnMatchInv", "對獎")
        htmlBody = WebPageGenFunc.getHTMLBody(url, , postdata)
        If Not htmlBody Is Nothing Then
            indexEnd = htmlBody.IndexOf("<div class=""DivRight"">")
            If indexEnd > 0 Then
                htmlBody = htmlBody.Substring(indexEnd)
                '取得div標籤的內容
                WebPageGenFunc.getHTMLTagContain(htmlBody, "div", valueContain, indexEnd)
                Console.WriteLine("使用WebClient的Post方法取得資料: " & valueContain)
            End If
        End If
        
        '使用WebRequest的Post方法取得資料
        searchStr = String.Empty
        '在組合參數時要同時編碼
        For i As Integer = 0 To postdata.Count - 1
            searchStr &= "&" & postdata.GetKey(i) & "=" & WebPageGenFunc.getEncodeStr(postdata(i))
        Next
        searchStr = searchStr.Substring(1)
        htmlBody = WebPageGenFunc.getHTMLBody(url, searchStr)
        If Not htmlBody Is Nothing Then
            indexEnd = htmlBody.IndexOf("<div class=""DivRight"">")
            If indexEnd > 0 Then
                htmlBody = htmlBody.Substring(indexEnd)
                '取得div標籤的內容
                WebPageGenFunc.getHTMLTagContain(htmlBody, "div", valueContain, indexEnd)
                Console.WriteLine("使用WebRequest的Post方法取得資料: " & valueContain)
            End If
        End If

執行結果

span標籤的內容:
<a class='quickedit' href='http://www.blogger.com/rearrange?blogID=2698062899592178296&widgetType=HTML&widgetId=HTML1&action=editWidget' onclick='return _WidgetManager._PopupConfig(document.getElementById("HTML1"));' target='configHTML1' title='編輯'>
<img alt='' height='18' src='http://img1.blogblog.com/img/icon18_wrench_allbkg.png' width='18'/>
</a>

a標籤的內容與onclick的屬性: return _WidgetManager._PopupConfig(document.getElementById("HTML1"));    ,內容:
<img alt='' height='18' src='http://img1.blogblog.com/img/icon18_wrench_allbkg.png' width='18'/>

使用WebClient的Post方法取得資料: <span style="font-weight:bold;">對獎結果:</span><br>
                <div class="DivResult">
            無中獎發票...<br>
                </div>
           
使用WebRequest的Post方法取得資料: <span style="font-weight:bold;">對獎結果:</span><br>
                <div class="DivResult">
            無中獎發票...<br>
                </div>

相關連結

這裡是關於技術的手札~

也歡迎大家到

倫與貓的足跡



到噗浪來

關心一下我唷!
by 倫
 
Copyright 2009 倫倫3號Beta-Log All rights reserved.
Blogger Templates created by Deluxe Templates
Wordpress Theme by EZwpthemes