首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > 开发语言 > VB >

vb中inte获取网页内的所有超链接,该怎么解决

2012-02-01 
vb中inte获取网页内的所有超链接vb中inte获取网页内的所有超链接 怎么获取包括正则转化不要用webbrower我

vb中inte获取网页内的所有超链接
vb中inte获取网页内的所有超链接 怎么获取
包括正则转化
不要用webbrower
我想它实现以下功能
输入网站 自动检索所有链接,并输出所有超链接
判断错误网页 生成地图

以下下载的源码 不是很懂 希望高手解释下
新手 不在乎分 下面代码有错误

VB code
Private Sub cmdExit_Click() 
  Unload Me
End Sub

Private Sub cmdGoWander_Click()
  StopSearching = False
  rtbLinkNames.Text = ""
  On Error Resume Next
  Close #1
  On Error GoTo 0
  ' Open CurDir & "\HTTPFILE.TXT" For Append As #1
  Current_Pos = 1
  Me.MousePointer = vbArrowHourglass
  cmdGoWander.Enabled = False
  cmdStopWandering.Enabled = True
  If Get_File(txtSite.Text) Then
    If Not Parse Then
      Me.MousePointer = vbNormal
      cmdGoWander.Enabled = True
      cmdStopWandering.Enabled = False
      Exit Sub
    End If
  Else
    Me.MousePointer = vbNormal
    MsgBox "Unable to find the desired site."
  End If
  cmdGoWander.Enabled = True
  cmdStopWandering.Enabled = False
  Me.MousePointer = vbNormal
  StopSearching = True
End Sub

Private Sub cmdStopWandering_Click()
  StopSearching = True
End Sub

Private Sub Form_Load()
  txtSite.Text = "http://www.jamsa.com/jamsa1.htm"
  NewLine = Chr(13) & Chr(10)
  StopSearching = True
  cmdStopWandering.Enabled = False
End Sub

Private Sub Form_QueryUnload(Cancel As Integer, UnloadMode As Integer)
  Dim Response As Integer
 
  If Not StopSearching Then
    Response = MsgBox("Stop search and lose updated file?", vbYesNo)
    If Response = vbNo Then
      Cancel = 1
      Exit Sub
    End If
  Else
    Response = MsgBox("Close program?", vbYesNo)
    If Response = vbNo Then
      Cancel = 1
      Exit Sub
    End If
  End If
  itcWander.Cancel
End Sub

Private Sub Form_Unload(Cancel As Integer)
  Unload frmSearching
  Unload frmParsing
End Sub

Private Sub itcWander_StateChanged(ByVal State As Integer)
  Dim strMess As String ' Message variable.
 
  DoEvents
  Select Case State
    Case 1                    'Resolving host
      Connected = False
 
    Case icConnected
      Connected = True
     
    Case State = icError
      strMess = "ErrorCode: " & itcWander.ResponseCode & " : " & itcSearch.ResponseInfo
      MsgBox strMess, vbOKOnly
      itcWanderSearchError = True
    End Select
End Sub


VB code
Option Explicit 
Option Compare Text

Const TAG_LENGTH% = 1000
Const OUT_FILE = "\taglist.txt"
Public Current_Pos As Long
Public Tag As String
Public Real_File_Name As String
Public File_Name As String
Public Site As String
Public Location As String
Public Site_Length As Integer
Public NewLine As String
Public SiteContents As String
Public inetSearchError As Boolean, StopSearching As Boolean

Public Function TrimPage(ByVal Address As String) As String


  Do While Right$(Address, 1) <> "/"
    Address = Left$(Address, Len(Address) - 1)
  Loop
  TrimPage = Address
End Function
Private Function ResolvedSite(FileAddr As String, Parent As String, NewTag As String) As Boolean
  'On Error GoTo ResolveError
  ResolvedSite = True
  Parent = FileAddr
  If Right$(Parent, 1) <> "/" Then
    Parent = TrimPage(Parent)
  End If
  If Left$(NewTag, 3) <> "../" And Left$(NewTag, 5) <> "http:" Then
    Exit Function
  End If
  If Left$(NewTag, 6) = "http:/" And Left$(NewTag, 7) <> "http://" Then
    NewTag = Right$(NewTag, Len(NewTag) - 6)
  End If
  Do While Left$(NewTag, 3) = "../"
    NewTag = Right$(NewTag, Len(NewTag) - 3)
    Parent = Left(Parent, Len(Parent) - 1)
    Do While Right$(Parent, 1) <> "/"
      Parent = Left$(Parent, Len(Parent) - 1)
    Loop
  Loop
Exit Function

ResolveError:
  ResolvedSite = False
  MsgBox "Unable to resolve parent site!"
End Function
Public Function Get_File(ByVal txtURL As String) As Boolean
  frmSearching.Hide
  frmSearching.lblSite.Caption = txtURL
  If Len(txtURL) > 40 Then
    frmSearching.lblSite.Width = Len(txtURL) * 73
    frmSearching.lblCaption.Width = frmSearching.lblSite.Width
    frmSearching.Width = frmSearching.lblCaption.Width + 435
  End If
  frmSearching.Show
  DoEvents
  Real_File_Name = txtURL
  Site = Real_File_Name
  Site_Length = Len(Site)
  inetSearchError = False
  frmWanderer.itcWander.RequestTimeout = 60
  frmWanderer.itcWander.AccessType = icUseDefault
  On Error Resume Next
  SiteContents = frmWanderer.itcWander.OpenURL(txtURL, icString)
  Unload frmSearching
  DoEvents
  If Err.Number <> 0 And Not inetSearchError Then
    Get_File = False
    Exit Function
  End If
  Get_File = True
End Function
Public Sub AddLink(LinktoAdd As String)
  Dim FoundPos As Integer
   
  FoundPos = 0
  FoundPos = frmWanderer.rtbLinkNames.Find(LinktoAdd, FoundPos)
  If FoundPos <> -1 Then                'the phrase was found.
    Exit Sub
  Else
    frmWanderer.rtbLinkNames.Text = frmWanderer.rtbLinkNames.Text & LinktoAdd & NewLine
  End If
End Sub
Public Function Parse() As Boolean
  Dim PositionInString As Long, Response As Integer, ThisLinkLength As Integer
  Dim End_Of_List As Boolean, NewFileName As String, GotFile As Boolean, Parent As String
  Dim Done As Boolean, Tag As String, lclTag As String, AddToFileString As String, RelativeAddress As Boolean
  Dim lclTag_Length As Integer, I As Integer, FirstQuote As Integer, SecondQuote As Integer
 
  End_Of_List = False
  PositionInString = 0
  Done = False
  If Not Initialize_OutputFile() Then Exit Function
  Do While Not End_Of_List And Not StopSearching
  Current_Pos = 1
    Done = Get_Tag(Tag)
    Do While Not Done And Not StopSearching
      frmParsing.Show
      DoEvents
      lclTag = Tag
      lclTag_Length = Len(Tag)


      FirstQuote = 0
      SecondQuote = 0
      If InStr(lclTag, "href") Then
        Do While Left$(lclTag, 4) <> "href"
          lclTag = Right$(lclTag, Len(lclTag) - 1)
        Loop
        If Not InStr(lclTag, "::") Then
          RelativeAddress = True
        Else
          RelativeAddress = False
        End If
        For I = 1 To lclTag_Length
          If Mid$(lclTag, I, 1) = Chr(34) Then
            If FirstQuote <> 0 Then
              SecondQuote = I
              Exit For
            Else
              FirstQuote = I + 1
            End If
          End If
        Next
        AddToFileString = Mid$(lclTag, FirstQuote, SecondQuote - FirstQuote)
        If InStr(AddToFileString, "://") Then
          AddLink (AddToFileString)
        Else
          If Not ResolvedSite(Site, Parent, AddToFileString) Then
              frmParsing.Hide
              MsgBox "Unable to resolve site!"
          Else
            AddLink (Parent & AddToFileString)
          End If
        End If
      End If
      Done = Get_Tag(Tag)
      DoEvents
    Loop
    frmParsing.Hide
    If Done Then
      If Len(frmWanderer.rtbLinkNames.Text) > 0 Then frmWanderer.rtbLinkNames.SaveFile App.Path & OUT_FILE, rtfText
      GotFile = False
    Else
      Response = MsgBox("Are you sure you want to stop search?", vbYesNo)
      If Response = vbYes Then
        frmWanderer.rtbLinkNames.SaveFile App.Path & OUT_FILE, rtfText
        frmWanderer.itcWander.Cancel
        Parse = Not StopSearching
        Exit Function
      End If
    End If
    DoEvents
    Do Until GotFile Or StopSearching
      If PositionInString < Len(frmWanderer.rtbLinkNames.Text) Then
        ThisLinkLength = 0
        If PositionInString = 0 Then PositionInString = 1
        Do While Mid$(frmWanderer.rtbLinkNames.Text, PositionInString + ThisLinkLength, 1) <> Chr(10)
          ThisLinkLength = ThisLinkLength + 1
          DoEvents
        Loop
        NewFileName = Mid$(frmWanderer.rtbLinkNames.Text, PositionInString, ThisLinkLength - 1)
        If Left$(NewFileName, 6) <> "mailto" Then
          PositionInString = PositionInString + ThisLinkLength + 1
          ThisLinkLength = 0
          If Not Get_File(NewFileName) Then
            MsgBox "Error opening page. Moving on to next page. Bad page = " & NewFileName


            GotFile = False
          Else
            GotFile = True
          End If
        Else
          GotFile = False
        End If
      Else
        GotFile = True
        End_Of_List = True
      End If
      DoEvents
    Loop
'    frmWanderer.rtbLinkNames.Text = AddToFileString
  Loop
  Parse = Not StopSearching
End Function
Public Function Get_Tag(ReturnTag As String) As Boolean
  ReturnTag = ""
  Get_Tag = False
 
    Do While Current_Pos < Len(SiteContents)
      If Mid(SiteContents, Current_Pos, 1) = " <" And Mid(SiteContents, Current_Pos + 1, 1) = "A" Then
        Dim Local_I As Integer
       
        Local_I = 1
        Do While Mid(SiteContents, Current_Pos + Local_I, 1) <> ">"
          If Local_I < TAG_LENGTH Then
            ReturnTag = ReturnTag & Mid(SiteContents, Current_Pos + Local_I, 1)
          End If
          Local_I = Local_I + 1
        Loop
        Current_Pos = Current_Pos + Local_I
        Exit Function
      End If
      Current_Pos = Current_Pos + 1
    Loop
  Get_Tag = True
End Function
Public Function Initialize_OutputFile() As Boolean
  If Dir(App.Path & OUT_FILE) <> "" Then
    On Error Resume Next
    Kill App.Path & OUT_FILE
    If Err.Number <> 0 Then
      MsgBox "Unable to open output file.", vbCritical
      Initialize_OutputFile = False
      Exit Function
    End If
  End If
  Open App.Path & OUT_FILE For Append As #1
  Close #1
  Initialize_OutputFile = True
  Exit Function
End Function



[解决办法]
东拼西凑的代码,这玩意完成到哪一步了?
分析源代码,用正则,最笨的办法是用instr搜索有没有http标记的
[解决办法]
VB code
'引用了Microsoft VBScript Regular Expressions 5.5Private Sub Command1_Click() Dim re As RegExp    Dim mh As Match    Dim mhs As MatchCollectionText1.Text = ""Source1 = Inet1.OpenURL("www.csdn.net")If Source1 <> "" ThenText1.Text = Source1Me.Inet1.CancelEnd IfSet re = New RegExpre.Global = Truere.Pattern = "http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?"Set mhs = re.Execute(Source1)For Each mh In mhsDebug.Print mhNextEnd Sub
[解决办法]
可以用正则分析源代码,也可以直接用DOM遍历.

热点排行