VB 在大批量数据内去重(上百万条记录)
要对上百万条记录进行去重 有的还需要统计重复的个数
现在的去重方法是
方法一:
Set fs = CreateObject("Scripting.FileSystemObject") Set ts1 = fs.OpenTextFile("D:\b.txt", 1, True) ipstring1 = "" ipstring2 = "" Set f = fs.CreateTextFile("d:\d.txt", True) Do While Not ts1.AtEndOfStream ipstring1 = ts1.readline Set ts2 = fs.OpenTextFile("D:\d.txt", 1, True) If Not ts2.AtEndOfStream Then ipstring2 = TextGetByFile(ts2.ReadAll) If InStr(ipstring2, ipstring1) > 0 Then 'If TimesInStr(ipstring2, ipstring1) > 0 Then Else f.WriteLine (ipstring1) End If Else f.WriteLine (ipstring1) End If ts2.Close Set ts2 = Nothing Loop
Set fs = CreateObject("Scripting.FileSystemObject") Set ts1 = fs.OpenTextFile("d:\b.txt", 1, True) ipstring1 = "" ipstring2 = "" Set f = fs.CreateTextFile("d:\dd.txt", True) Dim regEx, strTemp ' 建立变量。 Set regEx = New RegExp ' 建立正则表达式。 Dim blnMatche regEx.IgnoreCase = False ' 设置是否区分字符大小写。 regEx.Global = False ' 设置全局可用性。 regEx.MultiLine = True Do While Not ts1.AtEndOfStream ipstring1 = ts1.readline Set ts2 = fs.OpenTextFile("d:\dd.txt", 1, True) If Not ts2.AtEndOfStream Then ipstring2 = ts2.ReadAll regEx.Pattern = ipstring1 & "$" If regEx.test(ipstring2) Then Else f.WriteLine (ipstring1) End If Else f.WriteLine (ipstring1) End If ts2.Close Set ts2 = Nothing Loop
Private Sub Command2_Click() Dim col As Collection Set col = New Collection Dim stream1 As TextStream Set stream1 = fso.OpenTextFile(fso.BuildPath(App.Path, "test.txt"), ForReading, False) Dim stream2 As TextStream Set stream2 = fso.OpenTextFile(fso.BuildPath(App.Path, "output.txt"), ForWriting, True) While Not stream1.AtEndOfStream Dim strLine As String strLine = stream1.ReadLine Dim vntValue As Variant vntValue = Empty ' 获取关键字对应的节点对象 On Error Resume Next vntValue = col.Item(strLine) On Error GoTo 0 If IsEmpty(vntValue) Then Call col.Add(Null, strLine) Call stream2.WriteLine(strLine) End If Wend Call stream1.Close Call stream2.CloseEnd Sub