vb中inte获取网页内的所有超链接
vb中inte获取网页内的所有超链接 怎么获取
包括正则转化
不要用webbrower
我想它实现以下功能
输入网站 自动检索所有链接,并输出所有超链接
判断错误网页 生成地图
以下下载的源码 不是很懂 希望高手解释下
新手 不在乎分 下面代码有错误
Private Sub cmdExit_Click()
Unload Me
End Sub
Private Sub cmdGoWander_Click()
StopSearching = False
rtbLinkNames.Text = ""
On Error Resume Next
Close #1
On Error GoTo 0
' Open CurDir & "\HTTPFILE.TXT" For Append As #1
Current_Pos = 1
Me.MousePointer = vbArrowHourglass
cmdGoWander.Enabled = False
cmdStopWandering.Enabled = True
If Get_File(txtSite.Text) Then
If Not Parse Then
Me.MousePointer = vbNormal
cmdGoWander.Enabled = True
cmdStopWandering.Enabled = False
Exit Sub
End If
Else
Me.MousePointer = vbNormal
MsgBox "Unable to find the desired site."
End If
cmdGoWander.Enabled = True
cmdStopWandering.Enabled = False
Me.MousePointer = vbNormal
StopSearching = True
End Sub
Private Sub cmdStopWandering_Click()
StopSearching = True
End Sub
Private Sub Form_Load()
txtSite.Text = "http://www.jamsa.com/jamsa1.htm"
NewLine = Chr(13) & Chr(10)
StopSearching = True
cmdStopWandering.Enabled = False
End Sub
Private Sub Form_QueryUnload(Cancel As Integer, UnloadMode As Integer)
Dim Response As Integer
If Not StopSearching Then
Response = MsgBox("Stop search and lose updated file?", vbYesNo)
If Response = vbNo Then
Cancel = 1
Exit Sub
End If
Else
Response = MsgBox("Close program?", vbYesNo)
If Response = vbNo Then
Cancel = 1
Exit Sub
End If
End If
itcWander.Cancel
End Sub
Private Sub Form_Unload(Cancel As Integer)
Unload frmSearching
Unload frmParsing
End Sub
Private Sub itcWander_StateChanged(ByVal State As Integer)
Dim strMess As String ' Message variable.
DoEvents
Select Case State
Case 1 'Resolving host
Connected = False
Case icConnected
Connected = True
Case State = icError
strMess = "ErrorCode: " & itcWander.ResponseCode & " : " & itcSearch.ResponseInfo
MsgBox strMess, vbOKOnly
itcWanderSearchError = True
End Select
End Sub
Option Explicit
Option Compare Text
Const TAG_LENGTH% = 1000
Const OUT_FILE = "\taglist.txt"
Public Current_Pos As Long
Public Tag As String
Public Real_File_Name As String
Public File_Name As String
Public Site As String
Public Location As String
Public Site_Length As Integer
Public NewLine As String
Public SiteContents As String
Public inetSearchError As Boolean, StopSearching As Boolean
Public Function TrimPage(ByVal Address As String) As String
Do While Right$(Address, 1) <> "/"
Address = Left$(Address, Len(Address) - 1)
Loop
TrimPage = Address
End Function
Private Function ResolvedSite(FileAddr As String, Parent As String, NewTag As String) As Boolean
'On Error GoTo ResolveError
ResolvedSite = True
Parent = FileAddr
If Right$(Parent, 1) <> "/" Then
Parent = TrimPage(Parent)
End If
If Left$(NewTag, 3) <> "../" And Left$(NewTag, 5) <> "http:" Then
Exit Function
End If
If Left$(NewTag, 6) = "http:/" And Left$(NewTag, 7) <> "http://" Then
NewTag = Right$(NewTag, Len(NewTag) - 6)
End If
Do While Left$(NewTag, 3) = "../"
NewTag = Right$(NewTag, Len(NewTag) - 3)
Parent = Left(Parent, Len(Parent) - 1)
Do While Right$(Parent, 1) <> "/"
Parent = Left$(Parent, Len(Parent) - 1)
Loop
Loop
Exit Function
ResolveError:
ResolvedSite = False
MsgBox "Unable to resolve parent site!"
End Function
Public Function Get_File(ByVal txtURL As String) As Boolean
frmSearching.Hide
frmSearching.lblSite.Caption = txtURL
If Len(txtURL) > 40 Then
frmSearching.lblSite.Width = Len(txtURL) * 73
frmSearching.lblCaption.Width = frmSearching.lblSite.Width
frmSearching.Width = frmSearching.lblCaption.Width + 435
End If
frmSearching.Show
DoEvents
Real_File_Name = txtURL
Site = Real_File_Name
Site_Length = Len(Site)
inetSearchError = False
frmWanderer.itcWander.RequestTimeout = 60
frmWanderer.itcWander.AccessType = icUseDefault
On Error Resume Next
SiteContents = frmWanderer.itcWander.OpenURL(txtURL, icString)
Unload frmSearching
DoEvents
If Err.Number <> 0 And Not inetSearchError Then
Get_File = False
Exit Function
End If
Get_File = True
End Function
Public Sub AddLink(LinktoAdd As String)
Dim FoundPos As Integer
FoundPos = 0
FoundPos = frmWanderer.rtbLinkNames.Find(LinktoAdd, FoundPos)
If FoundPos <> -1 Then 'the phrase was found.
Exit Sub
Else
frmWanderer.rtbLinkNames.Text = frmWanderer.rtbLinkNames.Text & LinktoAdd & NewLine
End If
End Sub
Public Function Parse() As Boolean
Dim PositionInString As Long, Response As Integer, ThisLinkLength As Integer
Dim End_Of_List As Boolean, NewFileName As String, GotFile As Boolean, Parent As String
Dim Done As Boolean, Tag As String, lclTag As String, AddToFileString As String, RelativeAddress As Boolean
Dim lclTag_Length As Integer, I As Integer, FirstQuote As Integer, SecondQuote As Integer
End_Of_List = False
PositionInString = 0
Done = False
If Not Initialize_OutputFile() Then Exit Function
Do While Not End_Of_List And Not StopSearching
Current_Pos = 1
Done = Get_Tag(Tag)
Do While Not Done And Not StopSearching
frmParsing.Show
DoEvents
lclTag = Tag
lclTag_Length = Len(Tag)
FirstQuote = 0
SecondQuote = 0
If InStr(lclTag, "href") Then
Do While Left$(lclTag, 4) <> "href"
lclTag = Right$(lclTag, Len(lclTag) - 1)
Loop
If Not InStr(lclTag, "::") Then
RelativeAddress = True
Else
RelativeAddress = False
End If
For I = 1 To lclTag_Length
If Mid$(lclTag, I, 1) = Chr(34) Then
If FirstQuote <> 0 Then
SecondQuote = I
Exit For
Else
FirstQuote = I + 1
End If
End If
Next
AddToFileString = Mid$(lclTag, FirstQuote, SecondQuote - FirstQuote)
If InStr(AddToFileString, "://") Then
AddLink (AddToFileString)
Else
If Not ResolvedSite(Site, Parent, AddToFileString) Then
frmParsing.Hide
MsgBox "Unable to resolve site!"
Else
AddLink (Parent & AddToFileString)
End If
End If
End If
Done = Get_Tag(Tag)
DoEvents
Loop
frmParsing.Hide
If Done Then
If Len(frmWanderer.rtbLinkNames.Text) > 0 Then frmWanderer.rtbLinkNames.SaveFile App.Path & OUT_FILE, rtfText
GotFile = False
Else
Response = MsgBox("Are you sure you want to stop search?", vbYesNo)
If Response = vbYes Then
frmWanderer.rtbLinkNames.SaveFile App.Path & OUT_FILE, rtfText
frmWanderer.itcWander.Cancel
Parse = Not StopSearching
Exit Function
End If
End If
DoEvents
Do Until GotFile Or StopSearching
If PositionInString < Len(frmWanderer.rtbLinkNames.Text) Then
ThisLinkLength = 0
If PositionInString = 0 Then PositionInString = 1
Do While Mid$(frmWanderer.rtbLinkNames.Text, PositionInString + ThisLinkLength, 1) <> Chr(10)
ThisLinkLength = ThisLinkLength + 1
DoEvents
Loop
NewFileName = Mid$(frmWanderer.rtbLinkNames.Text, PositionInString, ThisLinkLength - 1)
If Left$(NewFileName, 6) <> "mailto" Then
PositionInString = PositionInString + ThisLinkLength + 1
ThisLinkLength = 0
If Not Get_File(NewFileName) Then
MsgBox "Error opening page. Moving on to next page. Bad page = " & NewFileName
GotFile = False
Else
GotFile = True
End If
Else
GotFile = False
End If
Else
GotFile = True
End_Of_List = True
End If
DoEvents
Loop
' frmWanderer.rtbLinkNames.Text = AddToFileString
Loop
Parse = Not StopSearching
End Function
Public Function Get_Tag(ReturnTag As String) As Boolean
ReturnTag = ""
Get_Tag = False
Do While Current_Pos < Len(SiteContents)
If Mid(SiteContents, Current_Pos, 1) = " <" And Mid(SiteContents, Current_Pos + 1, 1) = "A" Then
Dim Local_I As Integer
Local_I = 1
Do While Mid(SiteContents, Current_Pos + Local_I, 1) <> ">"
If Local_I < TAG_LENGTH Then
ReturnTag = ReturnTag & Mid(SiteContents, Current_Pos + Local_I, 1)
End If
Local_I = Local_I + 1
Loop
Current_Pos = Current_Pos + Local_I
Exit Function
End If
Current_Pos = Current_Pos + 1
Loop
Get_Tag = True
End Function
Public Function Initialize_OutputFile() As Boolean
If Dir(App.Path & OUT_FILE) <> "" Then
On Error Resume Next
Kill App.Path & OUT_FILE
If Err.Number <> 0 Then
MsgBox "Unable to open output file.", vbCritical
Initialize_OutputFile = False
Exit Function
End If
End If
Open App.Path & OUT_FILE For Append As #1
Close #1
Initialize_OutputFile = True
Exit Function
End Function
'引用了Microsoft VBScript Regular Expressions 5.5Private Sub Command1_Click() Dim re As RegExp Dim mh As Match Dim mhs As MatchCollectionText1.Text = ""Source1 = Inet1.OpenURL("www.csdn.net")If Source1 <> "" ThenText1.Text = Source1Me.Inet1.CancelEnd IfSet re = New RegExpre.Global = Truere.Pattern = "http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?"Set mhs = re.Execute(Source1)For Each mh In mhsDebug.Print mhNextEnd Sub
[解决办法]
可以用正则分析源代码,也可以直接用DOM遍历.