[英]Optimizing Big Data Comparison Using For Loops
好的,這是交易人員。 我有two text files
。 每個包含500
行(句子)。
我已將它們加載到memory
形成自己的數組(數據類型:字符串)。 我們將array A & B
命名為array A & B
接下來,我得到array A
的first sentence
,使用SPACE
作為分隔符將其拆分為另一個array C
,以便得到單詞。
然后對於array B
每個句子,我再次使用SPACE
作為分隔符將其分成array D
以獲得單詞,並將array C
中的每個單詞與array D
每個單詞進行比較,計算兩個句子之間的百分比匹配。
我計算在第一個句子的平均百分比匹配array A
針對所有的句子array B
。
然后我將它存儲到一個Array E
,其中包含array A
所有句子及其平均匹配百分比。
對於數組A中的每個標題,我用上面的first sentence
做我的工作。
問題是,處理陣列A中的每個標題大約需要15秒。無論如何,我可以優化這個時間來加快速度嗎?
硬件: AMD Phenom I 32位四核
碼:
Imports System.IO
Imports System.Object
Imports System.Xml
Imports System.Text.RegularExpressions
Module Module1
Sub Main()
'Important File Paths
Dim titlesFilePath As String = Environment.CurrentDirectory & "\titles.txt"
Dim xmlTitlesFilePath As String = Environment.CurrentDirectory & "\extractedTitles.txt"
Dim stopWordsFilePath As String = Environment.CurrentDirectory & "\stopWords.txt"
'Import Important Data From Files -> Memory
Dim titles As Array = FileToArray(titlesFilePath)
Dim stopWords As Array = FileToArray(stopWordsFilePath)
Dim xmlDataUnprocessed As Array = FileToArray(xmlTitlesFilePath)
'Delimters To Filter Titles For
Dim userDefinedDelimeters(4, 1)
userDefinedDelimeters(0, 0) = "-"
userDefinedDelimeters(0, 1) = " "
userDefinedDelimeters(1, 0) = ","
userDefinedDelimeters(1, 1) = " "
userDefinedDelimeters(2, 0) = "—"
userDefinedDelimeters(2, 1) = " "
userDefinedDelimeters(3, 0) = "'s"
userDefinedDelimeters(3, 1) = ""
userDefinedDelimeters(4, 0) = "'"
userDefinedDelimeters(4, 1) = " "
'Declare Important Variables
Dim xmlData(xmlDataUnprocessed.Length / 2, 1)
Dim xmlTurn = 0
Dim xmlDataCount = 0
'Create Feed Title/URL Array
For i = 0 To (xmlDataUnprocessed.Length - 1)
If xmlTurn = 0 Then
xmlData(xmlDataCount, 0) = xmlDataUnprocessed(i)
xmlTurn = 1
Else
xmlData(xmlDataCount, 1) = xmlDataUnprocessed(i)
xmlTurn = 0
xmlDataCount += 1
End If
Next
'CPU-Intensive Stuff Occurs
Dim xmlTitle As String
Dim xmlTitleWords As Array
Dim savedTitleWords As Array
Dim titleResults(xmlData.GetUpperBound(0) - 1, 1)
Dim titlePercentageMatch As Integer
Dim numberOfTitlesMatched As Integer
For i = 0 To xmlData.GetUpperBound(0) - 1
Console.WriteLine("Working On Title No. " & i & " Out Of " & xmlData.GetUpperBound(0) - 1)
titlePercentageMatch = 0
numberOfTitlesMatched = 0
xmlTitle = xmlData(i, 0)
xmlTitle = processTitle(stopWords, userDefinedDelimeters, xmlTitle)
xmlTitleWords = xmlTitle.Split(" ")
For Each title In titles
title = processTitle(stopWords, userDefinedDelimeters, title)
savedTitleWords = title.split(" ")
Dim compareResult = compareTitle(xmlTitleWords, savedTitleWords)
If compareResult > 0 Then
titlePercentageMatch += compareResult
numberOfTitlesMatched += 1
End If
Next
titleResults(i, 0) = xmlData(i, 0)
titleResults(i, 1) = (titlePercentageMatch / numberOfTitlesMatched)
Next
For i = 0 To titleResults.GetUpperBound(0) - 1
Console.WriteLine(titleResults(i, 0) & " ---> " & titleResults(i, 1) & vbCrLf)
Next
Console.Read()
End Sub
Function compareTitle(ByRef xmlTitleWords As Array, ByRef savedTitleWords As Array)
Dim NumberOfMatches = 0
For Each xmlWord In xmlTitleWords
For Each savedWord In savedTitleWords
If (xmlWord.ToString.ToLower = savedWord.ToString.ToLower) Then
NumberOfMatches += 1
End If
Next
Next
Return ((NumberOfMatches / xmlTitleWords.Length) * 100)
End Function
Function processTitle(ByRef stopWordArray As Array, ByRef delimArray As Array, ByVal title As String)
title = removeStopWords(stopWordArray, title)
title = removeDelims(delimArray, title)
Return title
End Function
Function removeStopWords(ByRef stopWordsArray As Array, ByVal sentence As String)
For i = 0 To stopWordsArray.Length - 1
If sentence.ToLower.Contains(" " & stopWordsArray(i).ToString.ToLower & " ") = True Then
sentence = Microsoft.VisualBasic.Strings.Replace(sentence, " " & stopWordsArray(i) & " ", " ", 1, -1, Constants.vbTextCompare)
'ElseIf sentence.ToLower.Contains(stopWordsArray(i).ToString.ToLower & " ") = True Then
'sentence = Microsoft.VisualBasic.Strings.Replace(sentence, stopWordsArray(i) & " ", "", 1, -1, Constants.vbTextCompare)
End If
sentence = Regex.Replace(sentence, "\s+", " ")
Dim Words = sentence.ToLower.Split(" ")
If Words(0).ToString.ToLower & " " = stopWordsArray(i).ToString.ToLower & " " Then
sentence = sentence.Remove(0, stopWordsArray(i).ToString.ToLower.Length + 1)
End If
Words = sentence.ToLower.Split(" ")
Dim LastWord = Words(Words.Length - 1)
'Console.WriteLine(LastWord & "++")
If " " & LastWord.ToString.ToLower = " " & stopWordsArray(i).ToString.ToLower Then
sentence = sentence.Remove(sentence.Length - 1 - LastWord.Length, stopWordsArray(i).ToString.ToLower.Length + 1)
End If
Next
sentence = Regex.Replace(sentence, "\s+", " ")
Return sentence
End Function
Function removeDelims(ByRef delimArray As Array, ByVal sentence As String)
For i = 0 To delimArray.GetUpperBound(0) - 1
sentence = sentence.Replace(delimArray(i, 0), delimArray(i, 1))
Next
sentence = Regex.Replace(sentence, "\s+", " ")
Return sentence
End Function
Function FileToArray(ByVal filePath As String) As String()
Dim content As String
Dim lines As New ArrayList
Dim sr As System.IO.StreamReader
' read the file's lines into an ArrayList
Try
sr = New System.IO.StreamReader(filePath)
Do While sr.Peek() >= 0
lines.Add(sr.ReadLine())
Loop
Finally
If Not sr Is Nothing Then sr.Close()
End Try
' convert from ArrayList to a String array
Return CType(lines.ToArray(GetType(String)), String())
End Function
End Module
編輯:我希望它不會太混亂。 對於那個很抱歉! 編輯2:提供醬:P
你的基本算法是N * M * A 2
如果您有500 * 500 * 5 2,那么您將推動6,250,000個不區分大小寫的字符串比較。 但那就是你所做的一切。 你的內部循環通過外循環的長度為每個title
調用processTitle
。 它不需要那樣做。
你可以做的是有一個預處理步驟,用表示該單詞的整數(符號)替換每個單詞。 為此,您將使用字典查找符號,如果沒有,則分配新的唯一符號(例如,保留整數計數器並使用下一個值)。
然后,您的主處理循環將與之前的類似,但您需要進行整數比較(更快)。 實際上,您希望此處理步驟僅進行比較和統計信息收集。 其他一切都應該搬出去。
保持預處理步驟。
並行化您的處理步驟。 一種方法是使用Parallel.For()作為最外層循環: Parallel.For(0, xmlData.GetUpperBound(0) - 1, Sub(i) ... End Sub)
其中action是你的循環體從上面。 TPL可能會很好地平衡負載(均勻地使用4個內核)。
另一種方法是使用任務並行庫來啟動對1/4數據進行操作的任務。 然后開始使用結果的延續。
一次只讀取array B
到存儲器,然后逐行讀取array A
適用時使用StringBuilder而不是string。
另外,看看你是否可以使用並行處理,即:任務。
至於距離,匹配算法,你沒有提到它是什么以及它是如何做的。 很難說出來。
你用一些東西放慢了自己的速度......
If (xmlWord.ToString.ToLower = savedWord.ToString.ToLower) Then
為什么不把所有內容改成小寫,甚至將它放入陣列?
title = processTitle(stopWords, userDefinedDelimeters, title)
為什么不把它加載到數組之前呢?
提前完成所有處理,然后單獨進行比較。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.