As an exercise, I wrote Peter Norvig's spelling corrector algorithm in Haskell:
module Spl (nwords, correct)
where
import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Text.Regex.TDFA (getAllTextMatches, (=~))
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
type NWords = Map.Map String Int
alphabet :: String
alphabet = enumFromTo 'a' 'z'
nwords :: String -> Map.Map String Int
nwords = train . words'
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = uniq' Set.empty
where uniq' _ [] = []
uniq' seen (x:xs)
| x `Set.member` seen = uniq' seen xs
| otherwise = x:uniq' (x `Set.insert` seen) xs
words' :: String -> [String]
words' = getAllTextMatches . flip (=~) "[a-z]+" . map toLower
train :: [String] -> NWords
train = foldl' populate Map.empty
where populate m feature = Map.insertWith (+) feature 1 m
edits :: String -> [String]
edits word = uniqueElems $ concat [dels, trans, repl, ins]
where dels = [a ++ tail b | (a,b) <- splits, nn b]
trans = [ a ++ (b!!1):head b:tail (tail b) | (a,b) <- splits
, length b > 1]
repl = [a ++ c:tail b | (a,b) <- splits, c <- alphabet, nn b]
ins = [a ++ c:b | (a,b) <- splits, c <- alphabet]
splits = [splitAt n word | n <- [0..length word]]
nn = not . null
knownEdits :: NWords -> String -> [String]
knownEdits nw word = uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1
, Map.member e2 nw]
known :: NWords -> [String] -> [String]
known nw = uniqueElems . filter (`Map.member` nw)
correct :: NWords -> String -> String
correct nw word = fst $ maximumBy (comparing snd) candidates
where candidates = [(w, Map.findWithDefault 0 w nw) | w <- result]
result = head $ filter (not . null) start
start = [ known nw [word], known nw $ edits word
, knownEdits nw word , [word]]
Usage
This is how I would use it:
ghci> t <- readFile "big.txt"
ghci> let nw = nwords t
ghci> correct nw "speling"
"spelling"
The big.txt
file is available on Peter Norvig's site (direct link, 6.2MB): http://norvig.com/big.txt
Problems
train
function is much slower, than words'
, so it's the bottleneck. So, where did I get it wrong? Do I have a memory leak somewhere?
My main suggestions are:
Data.HashMap.Strict
The following code can load all of big.txt
into a Data.Hashmap.Strict
in about 2 secs. Memory usage is about 25 MB (on a 64-bit system):
import qualified Data.HashMap.Strict as H
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Data.List
isAlpha ch = ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')
wrds :: T.Text -> [ T.Text ]
wrds bs =
let
(_, r1) = T.span (not . isAlpha) bs
(w, r2) = T.span isAlpha r1
in if T.null w then [] else T.toLower w : wrds r2
readDict = do
allwords <- fmap wrds $ T.readFile "big.txt"
let h = foldl' add H.empty all words
add h w = let c = H.lookupDefault (0 :: Int) w h
in H.insert w (c+1) h
member = \k -> H.member k h
frequency = \k -> H.lookupDefault 0 k h
return (member, frequency)
It might be more efficient to use lazy Text - something to investigate.
Here's the rest of my implementation - pretty much follows Norvig but I've made some other choices which you might find interesting:
{-# LANGUAGE OverloadedStrings #-}
module SpellText
where
import qualified Data.Text as T
import Data.Text (Text)
import Data.Monoid
import Data.List.Ordered (nubSort)
import Data.Ord
import Data.List
import Control.Monad
type Dict = ( Text -> Bool, Text -> Int )
singles :: [ Text ]
singles = map T.singleton ['a'..'z']
edits :: Text -> [ Text ]
edits w = deletes <> nubSort (transposes <> replaces) <> inserts
where
splits = zip (T.inits w) (T.tails w)
deletes = [ a <> (T.drop 1 b) | (a,b) <- splits, T.length b > 0 ]
transposes = [ a <> c <> (T.drop 2 b) | (a,b) <- splits, T.length b > 1,
let c = T.pack [ T.index b 1, T.index b 0 ] ]
replaces = [ a <> c <> (T.drop 1 b) | (a,b) <- splits, T.length b > 1,
c <- singles ]
inserts = [ a <> c <> b | (a,b) <- splits, c <- singles ]
orElse :: [a] -> [a] -> [a]
orElse [] bs = bs
orElse as _ = as
-- | Correct a word. 'isMember' and 'frequency' are functions to
-- determine if a word is in the dictionary and to lookup its
-- frequency, respectively.
correct :: Dict -> Text -> Text
correct (isMember,frequency) w0 =
let ed0 = [ w0 ]
ed1 = edits w0
ed2 = [ e2 | e1 <- ed1, e2 <- edits e1 ]
kn0 = filter isMember ed0
kn1 = filter isMember ed1
kn2 = filter isMember ed2
candidates = kn0 `orElse` (kn1 `orElse` (kn2 `orElse` [w0]))
in maximumBy (comparing frequency) candidates
Usage goes like this:
{-# LANGUAGE OverloadedStrings #-}
import ... -- import the above code
main = do
dictfns <- readDict
print $ correct dictfns "howwa"
My measured correction time is comparable to the Python version - perhaps 10% faster.
IMHO, it's not that slow as it seems. The most time is consumed in nwords
. The other function, correct
, is actually quite fast. On my machine, nwords
takes about 6.5s, and correct
takes less then 0.1s, which matches the processing speed of 10 words per seconds, which was Peter Norvigs target.
I could make a 20% performance improvement by force
ing the input and output of the train
function.
I compiled the program with ghc -o Spl -O2 Spl.hs
.
module Main (nwords, correct, main)
where
import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Text.Regex.TDFA (getAllTextMatches, (=~))
import Control.DeepSeq (deepseq, force)
import Control.Exception (evaluate)
import Data.Time.Clock.POSIX
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
type NWords = Map.Map String Int
alphabet :: String
alphabet = enumFromTo 'a' 'z'
nwords :: String -> Map.Map String Int
nwords = train . words'
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = uniq' Set.empty
where uniq' _ [] = []
uniq' seen (x:xs)
| x `Set.member` seen = uniq' seen xs
| otherwise = x:uniq' (x `Set.insert` seen) xs
words' :: String -> [String]
words' = getAllTextMatches . flip (=~) "[a-z]+" . map toLower
-- have a 20% performance improvement by using 'force' on input and output
train :: [String] -> NWords
train = force . foldl' populate Map.empty . force
where populate m feature = Map.insertWith (+) feature 1 m
edits :: String -> [String]
edits word = uniqueElems $ concat [dels, trans, repl, ins]
where dels = [a ++ tail b | (a,b) <- splits, nn b]
trans = [ a ++ (b!!1):head b:tail (tail b) | (a,b) <- splits
, length b > 1]
repl = [a ++ c:tail b | (a,b) <- splits, c <- alphabet, nn b]
ins = [a ++ c:b | (a,b) <- splits, c <- alphabet]
splits = [splitAt n word | n <- [0..length word]]
nn = not . null
knownEdits :: NWords -> String -> [String]
knownEdits nw word = uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1
, Map.member e2 nw]
known :: NWords -> [String] -> [String]
known nw = uniqueElems . filter (`Map.member` nw)
correct :: NWords -> String -> String
correct nw word = fst $ maximumBy (comparing snd) candidates
where candidates = [(w, Map.findWithDefault 0 w nw) | w <- result]
result = head $ filter (not . null) start
start = [ known nw [word], known nw $ edits word
, knownEdits nw word , [word]]
main = do
time0 <- getPOSIXTime
t <- readFile "big.txt"
time1 <- getPOSIXTime
putStrLn $ ":: readFile: "++(show $ time1-time0)
let nw = nwords t
evaluate $ force nw
time2 <- getPOSIXTime
putStrLn $ ":: nwords: " ++ (show $ time2-time1)
putStrLn $ correct nw "speling"
putStrLn $ correct nw "miracl"
putStrLn $ correct nw "helllo"
putStrLn $ correct nw "rabit"
putStrLn $ correct nw "kitteen"
putStrLn $ correct nw "breaks"
putStrLn $ correct nw "sometheeng"
putStrLn $ correct nw "clessical"
putStrLn $ correct nw "theater"
putStrLn $ correct nw "dishis"
time3 <- getPOSIXTime
putStrLn $ ":: correcting: " ++ (show $ time3-time2)
let a = time1-time0
let b = time2-time1
let c = time3-time2
let total = time3 - time0
putStrLn $ ":: total: "++(show $ time3-time0)
And this is my output:
:: readFile: 0.000202s
:: nwords: 6.063617s
spelling
miracle
hello
habit
kitten
breaks
something
classical
theater
dishes
:: correcting: 0.749441s
:: total: 6.81326s
Furthermore, the nwords
function becomes twice as fast if you don't use regexes:
words' :: String -> [String]
words' str = map (map toLower) $ words str
where words str = if (null a)
then (if null b then [] else words f)
else a:(words d)
where (a,b) = span isAlpha str
(c,d) = break isAlpha b
(e,f) = break isAlpha str
Interestingly, if you try to correct unknown words, it takes much longer to correct them.
(I'm a haskell beginner, and I'm trying to learn the language by answering stackoverflow questions, and by writing some toy programs.)
about the memory consumption:
This version just takes 21MB at most, at my system. It seems that there is a space leak if you either use regexes, or if you use force
in train
. If you omit both, it behaves fine. I think it has to do with the fact that the self-written words'
function is more lazy than the regex- words'
function.
module Main (nwords, correct, main)
where
import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Control.DeepSeq (force)
import Control.Exception (evaluate)
import Data.Time.Clock.POSIX
import Data.Char (isAlpha)
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
type NWords = Map.Map String Int
alphabet :: String
alphabet = enumFromTo 'a' 'z'
nwords :: String -> Map.Map String Int
nwords = train . words'
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = uniq' Set.empty
where uniq' _ [] = []
uniq' seen (x:xs)
| x `Set.member` seen = uniq' seen xs
| otherwise = x:uniq' (x `Set.insert` seen) xs
words' :: String -> [String]
words' str = map (map toLower) $ words str
where words str = if (null a)
then (if null b then [] else words f)
else a:(words d)
where (a,b) = span isAlpha str
(c,d) = break isAlpha b
(e,f) = break isAlpha str
train :: [String] -> NWords
train = foldl' populate Map.empty
where populate m feature = Map.insertWith (+) feature 1 m
edits :: String -> [String]
edits word = uniqueElems $ concat [dels, trans, repl, ins]
where dels = [a ++ tail b | (a,b) <- splits, nn b]
trans = [ a ++ (b!!1):head b:tail (tail b) | (a,b) <- splits
, length b > 1]
repl = [a ++ c:tail b | (a,b) <- splits, c <- alphabet, nn b]
ins = [a ++ c:b | (a,b) <- splits, c <- alphabet]
splits = [splitAt n word | n <- [0..length word]]
nn = not . null
knownEdits :: NWords -> String -> [String]
knownEdits nw word = uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1
, Map.member e2 nw]
known :: NWords -> [String] -> [String]
known nw = uniqueElems . filter (`Map.member` nw)
correct :: NWords -> String -> String
correct nw word = fst $ maximumBy (comparing snd) candidates
where candidates = [(w, Map.findWithDefault 0 w nw) | w <- result]
result = head $ filter (not . null) start
start = [ known nw [word], known nw $ edits word
, knownEdits nw word , [word]]
main = do
time0 <- getPOSIXTime
t <- readFile "big.txt"
time1 <- getPOSIXTime
putStrLn $ ":: readFile: "++(show $ time1-time0)
let nw = nwords t
evaluate $ force nw
time2 <- getPOSIXTime
putStrLn $ ":: nwords: " ++ (show $ time2-time1)
putStrLn $ correct nw "speling"
putStrLn $ correct nw "miracl"
putStrLn $ correct nw "helllo"
putStrLn $ correct nw "rabit"
putStrLn $ correct nw "kitteen"
putStrLn $ correct nw "breaks"
putStrLn $ correct nw "sometheeng"
putStrLn $ correct nw "clessical"
putStrLn $ correct nw "theater"
putStrLn $ correct nw "dishis"
time3 <- getPOSIXTime
putStrLn $ ":: correcting: " ++ (show $ time3-time2)
let a = time1-time0
let b = time2-time1
let c = time3-time2
let total = time3 - time0
putStrLn $ ":: total: "++(show $ time3-time0)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.