[英]Haskell version of Peter Norvig's spelling corrector is unbelievably slow
作為練習,我在Haskell中編寫了Peter Norvig的拼寫校正器算法 :
module Spl (nwords, correct)
where
import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Text.Regex.TDFA (getAllTextMatches, (=~))
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
type NWords = Map.Map String Int
alphabet :: String
alphabet = enumFromTo 'a' 'z'
nwords :: String -> Map.Map String Int
nwords = train . words'
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = uniq' Set.empty
where uniq' _ [] = []
uniq' seen (x:xs)
| x `Set.member` seen = uniq' seen xs
| otherwise = x:uniq' (x `Set.insert` seen) xs
words' :: String -> [String]
words' = getAllTextMatches . flip (=~) "[a-z]+" . map toLower
train :: [String] -> NWords
train = foldl' populate Map.empty
where populate m feature = Map.insertWith (+) feature 1 m
edits :: String -> [String]
edits word = uniqueElems $ concat [dels, trans, repl, ins]
where dels = [a ++ tail b | (a,b) <- splits, nn b]
trans = [ a ++ (b!!1):head b:tail (tail b) | (a,b) <- splits
, length b > 1]
repl = [a ++ c:tail b | (a,b) <- splits, c <- alphabet, nn b]
ins = [a ++ c:b | (a,b) <- splits, c <- alphabet]
splits = [splitAt n word | n <- [0..length word]]
nn = not . null
knownEdits :: NWords -> String -> [String]
knownEdits nw word = uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1
, Map.member e2 nw]
known :: NWords -> [String] -> [String]
known nw = uniqueElems . filter (`Map.member` nw)
correct :: NWords -> String -> String
correct nw word = fst $ maximumBy (comparing snd) candidates
where candidates = [(w, Map.findWithDefault 0 w nw) | w <- result]
result = head $ filter (not . null) start
start = [ known nw [word], known nw $ edits word
, knownEdits nw word , [word]]
用法
這就是我將如何使用它:
ghci> t <- readFile "big.txt"
ghci> let nw = nwords t
ghci> correct nw "speling"
"spelling"
可以在Peter Norvig的網站上找到big.txt
文件(直接鏈接為6.2MB): http : //norvig.com/big.txt
問題
train
功能比words'
慢得多,因此是瓶頸。 那么,我在哪里弄錯了? 我在某處有內存泄漏嗎?
我的主要建議是:
Data.HashMap.Strict
以下代碼可以在大約2秒鍾內將所有big.txt
加載到Data.Hashmap.Strict
中。 內存使用量約為25 MB(在64位系統上):
import qualified Data.HashMap.Strict as H
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Data.List
isAlpha ch = ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')
wrds :: T.Text -> [ T.Text ]
wrds bs =
let
(_, r1) = T.span (not . isAlpha) bs
(w, r2) = T.span isAlpha r1
in if T.null w then [] else T.toLower w : wrds r2
readDict = do
allwords <- fmap wrds $ T.readFile "big.txt"
let h = foldl' add H.empty all words
add h w = let c = H.lookupDefault (0 :: Int) w h
in H.insert w (c+1) h
member = \k -> H.member k h
frequency = \k -> H.lookupDefault 0 k h
return (member, frequency)
使用惰性文本可能會更有效-需要進行調查。
這是我實現的其余部分-在很大程度上遵循了Norvig,但是我做出了一些其他選擇,您可能會覺得很有趣:
{-# LANGUAGE OverloadedStrings #-}
module SpellText
where
import qualified Data.Text as T
import Data.Text (Text)
import Data.Monoid
import Data.List.Ordered (nubSort)
import Data.Ord
import Data.List
import Control.Monad
type Dict = ( Text -> Bool, Text -> Int )
singles :: [ Text ]
singles = map T.singleton ['a'..'z']
edits :: Text -> [ Text ]
edits w = deletes <> nubSort (transposes <> replaces) <> inserts
where
splits = zip (T.inits w) (T.tails w)
deletes = [ a <> (T.drop 1 b) | (a,b) <- splits, T.length b > 0 ]
transposes = [ a <> c <> (T.drop 2 b) | (a,b) <- splits, T.length b > 1,
let c = T.pack [ T.index b 1, T.index b 0 ] ]
replaces = [ a <> c <> (T.drop 1 b) | (a,b) <- splits, T.length b > 1,
c <- singles ]
inserts = [ a <> c <> b | (a,b) <- splits, c <- singles ]
orElse :: [a] -> [a] -> [a]
orElse [] bs = bs
orElse as _ = as
-- | Correct a word. 'isMember' and 'frequency' are functions to
-- determine if a word is in the dictionary and to lookup its
-- frequency, respectively.
correct :: Dict -> Text -> Text
correct (isMember,frequency) w0 =
let ed0 = [ w0 ]
ed1 = edits w0
ed2 = [ e2 | e1 <- ed1, e2 <- edits e1 ]
kn0 = filter isMember ed0
kn1 = filter isMember ed1
kn2 = filter isMember ed2
candidates = kn0 `orElse` (kn1 `orElse` (kn2 `orElse` [w0]))
in maximumBy (comparing frequency) candidates
用法如下:
{-# LANGUAGE OverloadedStrings #-}
import ... -- import the above code
main = do
dictfns <- readDict
print $ correct dictfns "howwa"
我測得的校正時間與Python版本相當-可能快10%。
恕我直言,它看起來並不慢。 nwords
消耗最多的時間。 另一個功能, correct
,實際上是非常快的。 在我的機器上, nwords
大約需要6.5s,而correct
時間少於0.1s,這與Peter Norvigs的目標是每秒10個單詞的處理速度相匹配。
通過force
train
函數的輸入和輸出,我可以將性能提高20%。
我用ghc -o Spl -O2 Spl.hs
編譯了程序。
module Main (nwords, correct, main)
where
import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Text.Regex.TDFA (getAllTextMatches, (=~))
import Control.DeepSeq (deepseq, force)
import Control.Exception (evaluate)
import Data.Time.Clock.POSIX
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
type NWords = Map.Map String Int
alphabet :: String
alphabet = enumFromTo 'a' 'z'
nwords :: String -> Map.Map String Int
nwords = train . words'
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = uniq' Set.empty
where uniq' _ [] = []
uniq' seen (x:xs)
| x `Set.member` seen = uniq' seen xs
| otherwise = x:uniq' (x `Set.insert` seen) xs
words' :: String -> [String]
words' = getAllTextMatches . flip (=~) "[a-z]+" . map toLower
-- have a 20% performance improvement by using 'force' on input and output
train :: [String] -> NWords
train = force . foldl' populate Map.empty . force
where populate m feature = Map.insertWith (+) feature 1 m
edits :: String -> [String]
edits word = uniqueElems $ concat [dels, trans, repl, ins]
where dels = [a ++ tail b | (a,b) <- splits, nn b]
trans = [ a ++ (b!!1):head b:tail (tail b) | (a,b) <- splits
, length b > 1]
repl = [a ++ c:tail b | (a,b) <- splits, c <- alphabet, nn b]
ins = [a ++ c:b | (a,b) <- splits, c <- alphabet]
splits = [splitAt n word | n <- [0..length word]]
nn = not . null
knownEdits :: NWords -> String -> [String]
knownEdits nw word = uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1
, Map.member e2 nw]
known :: NWords -> [String] -> [String]
known nw = uniqueElems . filter (`Map.member` nw)
correct :: NWords -> String -> String
correct nw word = fst $ maximumBy (comparing snd) candidates
where candidates = [(w, Map.findWithDefault 0 w nw) | w <- result]
result = head $ filter (not . null) start
start = [ known nw [word], known nw $ edits word
, knownEdits nw word , [word]]
main = do
time0 <- getPOSIXTime
t <- readFile "big.txt"
time1 <- getPOSIXTime
putStrLn $ ":: readFile: "++(show $ time1-time0)
let nw = nwords t
evaluate $ force nw
time2 <- getPOSIXTime
putStrLn $ ":: nwords: " ++ (show $ time2-time1)
putStrLn $ correct nw "speling"
putStrLn $ correct nw "miracl"
putStrLn $ correct nw "helllo"
putStrLn $ correct nw "rabit"
putStrLn $ correct nw "kitteen"
putStrLn $ correct nw "breaks"
putStrLn $ correct nw "sometheeng"
putStrLn $ correct nw "clessical"
putStrLn $ correct nw "theater"
putStrLn $ correct nw "dishis"
time3 <- getPOSIXTime
putStrLn $ ":: correcting: " ++ (show $ time3-time2)
let a = time1-time0
let b = time2-time1
let c = time3-time2
let total = time3 - time0
putStrLn $ ":: total: "++(show $ time3-time0)
這是我的輸出:
:: readFile: 0.000202s
:: nwords: 6.063617s
spelling
miracle
hello
habit
kitten
breaks
something
classical
theater
dishes
:: correcting: 0.749441s
:: total: 6.81326s
此外,如果不使用正則表達式, nwords
函數的速度將提高一倍:
words' :: String -> [String]
words' str = map (map toLower) $ words str
where words str = if (null a)
then (if null b then [] else words f)
else a:(words d)
where (a,b) = span isAlpha str
(c,d) = break isAlpha b
(e,f) = break isAlpha str
有趣的是,如果您嘗試更正未知單詞,則需要花費更長的時間來更正它們。
(我是一位Haskell初學者,我正試圖通過回答stackoverflow問題以及編寫一些玩具程序來學習該語言。)
關於內存消耗:
在我的系統上,此版本最多僅占用21MB。 如果您使用正則表達式或在train
使用force
,似乎存在空間泄漏。 如果兩者都省略,則表現良好。 我認為這與以下事實有關:自寫words'
功能比正則表達式words'
功能更懶。
module Main (nwords, correct, main)
where
import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Control.DeepSeq (force)
import Control.Exception (evaluate)
import Data.Time.Clock.POSIX
import Data.Char (isAlpha)
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
type NWords = Map.Map String Int
alphabet :: String
alphabet = enumFromTo 'a' 'z'
nwords :: String -> Map.Map String Int
nwords = train . words'
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = uniq' Set.empty
where uniq' _ [] = []
uniq' seen (x:xs)
| x `Set.member` seen = uniq' seen xs
| otherwise = x:uniq' (x `Set.insert` seen) xs
words' :: String -> [String]
words' str = map (map toLower) $ words str
where words str = if (null a)
then (if null b then [] else words f)
else a:(words d)
where (a,b) = span isAlpha str
(c,d) = break isAlpha b
(e,f) = break isAlpha str
train :: [String] -> NWords
train = foldl' populate Map.empty
where populate m feature = Map.insertWith (+) feature 1 m
edits :: String -> [String]
edits word = uniqueElems $ concat [dels, trans, repl, ins]
where dels = [a ++ tail b | (a,b) <- splits, nn b]
trans = [ a ++ (b!!1):head b:tail (tail b) | (a,b) <- splits
, length b > 1]
repl = [a ++ c:tail b | (a,b) <- splits, c <- alphabet, nn b]
ins = [a ++ c:b | (a,b) <- splits, c <- alphabet]
splits = [splitAt n word | n <- [0..length word]]
nn = not . null
knownEdits :: NWords -> String -> [String]
knownEdits nw word = uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1
, Map.member e2 nw]
known :: NWords -> [String] -> [String]
known nw = uniqueElems . filter (`Map.member` nw)
correct :: NWords -> String -> String
correct nw word = fst $ maximumBy (comparing snd) candidates
where candidates = [(w, Map.findWithDefault 0 w nw) | w <- result]
result = head $ filter (not . null) start
start = [ known nw [word], known nw $ edits word
, knownEdits nw word , [word]]
main = do
time0 <- getPOSIXTime
t <- readFile "big.txt"
time1 <- getPOSIXTime
putStrLn $ ":: readFile: "++(show $ time1-time0)
let nw = nwords t
evaluate $ force nw
time2 <- getPOSIXTime
putStrLn $ ":: nwords: " ++ (show $ time2-time1)
putStrLn $ correct nw "speling"
putStrLn $ correct nw "miracl"
putStrLn $ correct nw "helllo"
putStrLn $ correct nw "rabit"
putStrLn $ correct nw "kitteen"
putStrLn $ correct nw "breaks"
putStrLn $ correct nw "sometheeng"
putStrLn $ correct nw "clessical"
putStrLn $ correct nw "theater"
putStrLn $ correct nw "dishis"
time3 <- getPOSIXTime
putStrLn $ ":: correcting: " ++ (show $ time3-time2)
let a = time1-time0
let b = time2-time1
let c = time3-time2
let total = time3 - time0
putStrLn $ ":: total: "++(show $ time3-time0)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.