![](/img/trans.png)
[英]Optimizing VBA / Excel Macro Code (Finding Duplicates and Sorting Large Data Set )
[英]Optimizing VBA / Excel Macro Code (Finding Duplicates in Large Sheet)
除了几年前在 VB 中做的一些小事情之外,我真的从来没有用 VBA 或任何类似的东西编码过。 这是我尝试编写一些代码来搜索客户帐户的 Excel 表数据库并搜索可能的重复帐户。 遗憾的是,在我需要运行它的机器上,它只能处理大约 3,500 个条目而不会导致 Excel 崩溃。 这导致我的代码严重未优化以及机器运行缓慢。
可以做些什么来优化以下代码,以及我将来应该使用 VBA 中的哪些最佳实践?
'Essentially, this loops through each row in the sheet
'For each row, it loops through every row after it, searching for duplicates of itself (skipping over a rows that have previously been marked as duplicates)
'Duplicates are defined by entries that meet a 'threshhold' of similarity
'The threshhold is defined as the number '5', first and last names are each two points, address and email address are one point
'That means that in order for an entry to meet the thresshold, the first and last name must be the same, and it must also have either the same address or email
'When duplicates are found, the duplicate column is marked as 'Yes' for that row, and the first occurence column is marked with a number defining the row number where the account first appeared
Sub Main():
Dim lNameCol, fNameCol, addressCol, emailCol, duplicateCol, fOccurenceCol As String
'Defines the column letters for the various data fields
lNameCol = "A"
fNameCol = "B"
addressCol = "C"
emailCol = "D"
duplicateCol = "E" 'The column where a entry/row will be marked as being a duplicate
fOccurenceCol = "F" 'The column that contains the row number where a duplicate accounts first occurence was found
Call Duplicates(lNameCol, fNameCol, addressCol, emailCol, duplicateCol, fOccurenceCol)
End Sub
'Gets number of rows in currently active sheet
Function RowCount():
Application.ActiveSheet.UsedRange
RowCount = Worksheets("Sheet1").UsedRange.Rows.Count
End Function
'Finds and labels duplicates
Sub Duplicates(ByVal lNameCol As String, ByVal fNameCol As String, ByVal addressCol As String, ByVal emailCol As String, ByVal duplicateCol As String, ByVal fOccurenceCol As String)
Dim lRowCount As Integer
lRowCount = RowCount()
'Loops through each row in the sheet
For i = 1 To lRowCount
Dim duplicate, lastName, firstName, email, address As String
'Sets these variables' values corresponding cell value in row 'i'
'UCase capitilizes things to make entries case-insensitive
duplicate = UCase(Range(duplicateCol & i).Value)
lastName = UCase(Range(lNameCol & i).Value)
firstName = UCase(Range(fNameCol & i).Value)
email = UCase(Range(emailCol & i).Value)
address = UCase(Range(addressCol & i).Value)
'Checks to make sure row has not already been marked a duplicate, if it hasn't it continues
If (StrComp(duplicate = "YES", vbTextCompare) = 1) Then
'Loops through every row after the current row (row 'i')
For n = (i + 1) To lRowCount
'duplicateThreshold is an integer that defines the threshhold of similarity that rows need to have in order to be labeled a duplicate
Dim duplicateThreshhold As Integer
Dim lastName2, firstName2, email2, address2 As String
duplicateThreshhold = 0
'These are the entry variables for account entry at row 'n' being compared to the account entry at row 'i'
lastName2 = UCase(Range(lNameCol & n).Value)
firstName2 = UCase(Range(fNameCol & n).Value)
email2 = UCase(Range(emailCol & n).Value)
address2 = UCase(Range(addressCol & n).Value)
'Adds 2 points to threshhold if first name is the same
If lastName = lastName2 Then
duplicateThreshhold = duplicateThreshhold + 2
End If
'Adds 2 points to threshold if last name is the same
If firstName = firstName2 Then
duplicateThreshhold = duplicateThreshhold + 2
End If
'The remaining two fields give 1 point each to the thresshold
'As long as the sum of the points given by first and last name is always greater than half of the threshhold, first and last name will always be required
If email = email2 Or address = address2 Then
duplicateThreshhold = duplicateThreshhold + 1
End If
If duplicateThreshhold > 4 Then
'Labels duplicate entries as duplicates
Range(duplicateCol & i).Value = "Yes"
Range(duplicateCol & n).Value = "Yes"
'Labels duplicate entries with the first occurence of that entry
Range(fOccurenceCol & i).Value = i 'Labels first occurence account's row number
Range(fOccurenceCol & n).Value = i
End If
Next
End If
Next
End Sub
好的,这是我脑后的问题之一,所以我必须解决它(非常感谢@RJGordon!)。 我最终以两种不同的方式解决它 - 第一种使用嵌套循环,第二种使用散列字典。 第二个是一个更干净和更快的算法,但为了彻底起见,我将两者都呈现出来。
嵌套循环
正如@JohnColeman 指出的那样,这种方法在逻辑上是有道理的,但扩展性很差。 为每条记录提供所有重复行的列表很容易,并且具有标记数据集中第一行的优势。 (下面的第二个解决方案不会用下面的重复项标记初始记录,但如果需要,您也可以解决这个问题。)
Option Explicit
Sub test()
MarkDuplicates ActiveSheet, 1, 2, 3, 4, 5, 6
End Sub
Sub MarkDuplicates(sh As Worksheet, lNameCol As Long, _
fNameCol As Long, addressCol As Long, _
emailCol As Long, duplicateCol As Long, _
fOccuranceCol As Long)
Dim lastRow As Long
Dim lastCol As Long
Dim acctRange As Range
Dim acctData As Variant
Dim checkRow As Long
Dim otherRow As Long
Dim dupScore As Integer
Dim dupList As String
'--- determine the range of data and copy to a memory-based array
lastRow = sh.Cells(sh.Rows.Count, "A").End(xlUp).Row
lastCol = sh.Cells(1, sh.Columns.Count).End(xlToLeft).Column
Set acctRange = sh.Range("A1").Resize(lastRow, lastCol)
acctData = acctRange
'--- nested loop to check each row against every other row
For checkRow = 2 To lastRow
dupList = ""
For otherRow = 2 To lastRow
dupScore = 0
If otherRow <> checkRow Then
If acctData(checkRow, lNameCol) = acctData(otherRow, lNameCol) Then
dupScore = dupScore + 2
End If
If acctData(checkRow, fNameCol) = acctData(otherRow, fNameCol) Then
dupScore = dupScore + 2
End If
If acctData(checkRow, addressCol) = acctData(otherRow, addressCol) Then
dupScore = dupScore + 1
End If
If acctData(checkRow, emailCol) = acctData(otherRow, emailCol) Then
dupScore = dupScore + 1
End If
If dupScore > 4 Then
dupList = dupList & otherRow & ","
End If
End If
Next otherRow
If Len(dupList) > 0 Then
dupList = Left(dupList, Len(dupList) - 1)
acctData(checkRow, duplicateCol) = "Yes"
acctData(checkRow, fOccuranceCol) = dupList
Else
acctData(checkRow, duplicateCol) = ""
acctData(checkRow, fOccuranceCol) = ""
End If
Next checkRow
'--- copy the array back to the worksheet
acctRange = acctData
Set sh = Nothing
End Sub
使用字典
我的意思是字典(复数)。 因为可以使用三种不同的字段组合来达到您的重复分数阈值,所以您的字典哈希必须测试每个组合。 我选择的字典键(散列)是字段的串联字符串,在测试时,将指示重复记录。 此解决方案仅显示包含三个字典的单个循环。 如果您想要找到所有重复记录的列表,则重写代码以在单个循环中创建所有三个字典,然后针对每个字典键对每个记录使用单独的(非嵌套)循环,并保留一个正在运行的重复列表。 (为了效率,我将它保持在一个循环中。)
使用更长的键(例如姓氏+名字+地址+电子邮件)创建单个字典将导致您对所有这些字段重复的记录发生键冲突,但您仍然需要找到一种方法来测试其他组合。 比我聪明得多的人可能会想出更简单的方法。
Option Explicit
Sub test()
MarkDuplicates ActiveSheet, 1, 2, 3, 4, 5, 6
End Sub
Sub MarkDuplicates(sh As Worksheet, lNameCol As Long, _
fNameCol As Long, addressCol As Long, _
emailCol As Long, duplicateCol As Long, _
fOccuranceCol As Long)
Dim lastRow As Long
Dim lastCol As Long
Dim acctRange As Range
Dim acctData As Variant
Dim acctDict1 As Dictionary
Dim acctDict2 As Dictionary
Dim acctDict3 As Dictionary
Dim acctKey As String
Dim checkRow As Long
Dim otherRow As Long
Dim dupScore As Integer
Dim dupList As String
'--- determine the range of data and copy to a memory-based array
lastRow = sh.Cells(sh.Rows.Count, "A").End(xlUp).Row
lastCol = sh.Cells(1, sh.Columns.Count).End(xlToLeft).Column
Set acctRange = sh.Range("A1").Resize(lastRow, lastCol)
acctData = acctRange
Set acctDict1 = New Dictionary
Set acctDict2 = New Dictionary
Set acctDict3 = New Dictionary
'--- build the initial dictionary
' for the key to trip as duplicate, there are three possible
' combinations to check, so we make three dictionaries and
' create keys as combinations of the fields
For checkRow = 2 To lastRow
'--- clear previous flags
acctData(checkRow, duplicateCol) = ""
acctData(checkRow, fOccuranceCol) = ""
'--- dupe is lastname + firstname
acctKey = acctData(checkRow, lNameCol) & acctData(checkRow, fNameCol)
If Not acctDict1.Exists(acctKey) Then
acctDict1.Add acctKey, checkRow
ElseIf acctData(checkRow, duplicateCol) <> "Yes" Then
acctData(checkRow, duplicateCol) = "Yes1"
acctData(checkRow, fOccuranceCol) = acctDict1.Item(acctKey)
End If
'--- dupe is lastname + address + email
acctKey = acctData(checkRow, lNameCol) & acctData(checkRow, addressCol) & _
acctData(checkRow, emailCol)
If Not acctDict2.Exists(acctKey) Then
acctDict2.Add acctKey, checkRow
ElseIf acctData(checkRow, duplicateCol) <> "Yes" Then
acctData(checkRow, duplicateCol) = "Yes2"
acctData(checkRow, fOccuranceCol) = acctDict2.Item(acctKey)
End If
'--- dupe is firstname + address + email
acctKey = acctData(checkRow, fNameCol) & acctData(checkRow, addressCol) & _
acctData(checkRow, emailCol)
If Not acctDict3.Exists(acctKey) Then
acctDict3.Add acctKey, checkRow
ElseIf acctData(checkRow, duplicateCol) <> "Yes" Then
acctData(checkRow, duplicateCol) = "Yes3"
acctData(checkRow, fOccuranceCol) = acctDict3.Item(acctKey)
End If
Next checkRow
'--- copy the array back to the worksheet
acctRange = acctData
Set sh = Nothing
End Sub
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.