VB 在大批量数据内去重(上百万条记录)
本帖最后由 bcrun 于 2010-09-21 17:34:06 编辑 要对上百万条记录进行去重 有的还需要统计重复的个数
现在的去重方法是
方法一:
Set fs = CreateObject("Scripting.FileSystemObject")
Set ts1 = fs.OpenTextFile("D:\b.txt", 1, True)
ipstring1 = ""
ipstring2 = ""
Set f = fs.CreateTextFile("d:\d.txt", True)
Do While Not ts1.AtEndOfStream
ipstring1 = ts1.readline
Set ts2 = fs.OpenTextFile("D:\d.txt", 1, True)
If Not ts2.AtEndOfStream Then
ipstring2 = TextGetByFile(ts2.ReadAll)
If InStr(ipstring2, ipstring1) > 0 Then
'If TimesInStr(ipstring2, ipstring1) > 0 Then
Else
f.WriteLine (ipstring1)
End If
Else
f.WriteLine (ipstring1)
End If
ts2.Close
Set ts2 = Nothing
Loop
Set fs = CreateObject("Scripting.FileSystemObject")
Set ts1 = fs.OpenTextFile("d:\b.txt", 1, True)
ipstring1 = ""
ipstring2 = ""
Set f = fs.CreateTextFile("d:\dd.txt", True)
Dim regEx, strTemp ' 建立变量。
Set regEx = New RegExp ' 建立正则表达式。
Dim blnMatche
regEx.IgnoreCase = False ' 设置是否区分字符大小写。
regEx.Global = False ' 设置全局可用性。
regEx.MultiLine = True
Do While Not ts1.AtEndOfStream
ipstring1 = ts1.readline
Set ts2 = fs.OpenTextFile("d:\dd.txt", 1, True)
If Not ts2.AtEndOfStream Then
ipstring2 = ts2.ReadAll
regEx.Pattern = ipstring1 & "$"
If regEx.test(ipstring2) Then
Else
f.WriteLine (ipstring1)
End If
Else
f.WriteLine (ipstring1)
End If
ts2.Close
Set ts2 = Nothing
Loop
Private Sub Command2_Click()
Dim col As Collection
Set col = New Collection
Dim stream1 As TextStream
Set stream1 = fso.OpenTextFile(fso.BuildPath(App.Path, "test.txt"), ForReading, False)
Dim stream2 As TextStream
Set stream2 = fso.OpenTextFile(fso.BuildPath(App.Path, "output.txt"), ForWriting, True)
While Not stream1.AtEndOfStream
Dim strLine As String
strLine = stream1.ReadLine
Dim vntValue As Variant
vntValue = Empty
' 获取关键字对应的节点对象
On Error Resume Next
vntValue = col.Item(strLine)
On Error GoTo 0
If IsEmpty(vntValue) Then
Call col.Add(Null, strLine)
Call stream2.WriteLine(strLine)
End If
Wend
Call stream1.Close
Call stream2.Close
End Sub