[英]Optimizing Memory in Haskell, pipes, attoparsec, and containers
我正在嘗試進一步優化我的管道 - attoparsec解析器和存儲,但無法獲得更低的內存使用率。
給出account-parser.hs
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
import Protolude hiding (for)
import Data.Hashable
import Data.IntMap.Strict (IntMap)
import Data.Vector (Vector)
import Pipes
import Pipes.Parse
import Pipes.Safe (MonadSafe, runSafeT)
import qualified Data.Attoparsec.ByteString.Char8 as AB
import qualified Data.IntMap.Strict as IM
import qualified Data.Vector as Vector
import qualified Pipes.Attoparsec as PA
import qualified Pipes.ByteString as PB
import qualified Pipes.Safe.Prelude as PSP
-- accountid|account-name|contractid|code
data AccountLine = AccountLine {
_accountId :: !ByteString,
_accountName :: !ByteString,
_accountContractId :: !ByteString,
_accountCode :: !Word32
} deriving (Show)
type MapCodetoAccountIdIdx = IntMap Int
data Accounts = Accounts {
_accountIds :: !(Vector ByteString),
_cache :: !(IntMap Int),
_accountCodes :: !MapCodetoAccountIdIdx
} deriving (Show)
parseAccountLine :: AB.Parser AccountLine
parseAccountLine = AccountLine <$>
getSubfield <* delim <*>
getSubfield <* delim <*>
getSubfield <* delim <*>
AB.decimal <* AB.endOfLine
where getSubfield = AB.takeTill (== '|')
delim = AB.char '|'
--
aempty :: Accounts
aempty = Accounts Vector.empty IM.empty IM.empty
aappend :: Accounts -> AccountLine -> Accounts
aappend (Accounts ids a2i cps) (AccountLine aid an cid cp) =
case IM.lookup (hash aid) a2i of
Nothing -> Accounts
(Vector.snoc ids (toS aid))
(IM.insert (hash aid) (length ids) a2i)
(IM.insert (fromIntegral cp) (length ids) cps)
Just idx -> Accounts ids a2i (IM.insert (fromIntegral cp) idx cps)
foldAccounts :: (Monad m) => Parser AccountLine m Accounts
foldAccounts = foldAll aappend aempty identity
readByteStringFile :: (MonadSafe m) => FilePath -> Producer' ByteString m ()
readByteStringFile file = PSP.withFile file ReadMode PB.fromHandle
accountLines :: Text -> MonadSafe m => Producer AccountLine m (Either (PA.ParsingError, Producer ByteString m ()) ())
accountLines filename = PA.parsed parseAccountLine (readByteStringFile (toS filename))
main :: IO ()
main = do
[filename] <- getArgs
x <- runSafeT $ runEffect $ Pipes.Parse.evalStateT foldAccounts (accountLines (toS filename))
print $ sizes x
sizes :: Accounts -> (Int, Int, Int)
sizes (Accounts aid xxx acp) = (Vector.length aid, IM.size xxx, IM.size acp)
用GHC 8.0.2編譯( stack ghc -- -O2 -rtsopts -threaded -Wall account-parser.hs
)
我無法將內存使用率降低。 我必須快速查看因此IntMaps。 該文件大約20 MB(並沒有效率)。 大多數數據應該能夠容納5 MB。
$ ./account-parser /tmp/accounts +RTS -s
(5837,5837,373998)
1,631,040,680 bytes allocated in the heap
221,765,464 bytes copied during GC
41,709,048 bytes maximum residency (13 sample(s))
2,512,560 bytes maximum slop
82 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 2754 colls, 0 par 0.105s 0.142s 0.0001s 0.0002s
Gen 1 13 colls, 0 par 0.066s 0.074s 0.0057s 0.0216s
TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1)
SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)
INIT time 0.000s ( 0.001s elapsed)
MUT time 0.324s ( 0.298s elapsed)
GC time 0.171s ( 0.216s elapsed)
EXIT time 0.000s ( 0.005s elapsed)
Total time 0.495s ( 0.520s elapsed)
Alloc rate 5,026,660,297 bytes per MUT second
Productivity 65.5% of total user, 58.4% of total elapsed
gc_alloc_block_sync: 0
whitehole_spin: 0
gen[0].sync: 0
gen[1].sync: 0
和簡介:
如果我,
HashMap Text (Set Word32)
+RTS -c
我可以將總內存減少到34 MB,但我的查找現在轉到O(n)。 這可能是我最好的。
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
import Protolude hiding (for)
import qualified Data.Attoparsec.ByteString.Char8 as AB
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import Data.Set (Set)
import qualified Data.Set as Set
import Pipes
import qualified Pipes.Attoparsec as PA
import qualified Pipes.ByteString as PB
import Pipes.Parse
import Pipes.Safe (MonadSafe, runSafeT)
import qualified Pipes.Safe.Prelude as PSP
-- accountid|account-name|contractid|code
data AccountLine = AccountLine {
_accountId :: !ByteString,
_accountName :: !ByteString,
_accountContractId :: !ByteString,
_accountCode :: !Word32
} deriving (Show)
newtype Accounts = Accounts (HashMap Text (Set Word32))
deriving (Show)
parseAccountLine :: AB.Parser AccountLine
parseAccountLine = AccountLine <$>
getSubfield <* delim <*>
getSubfield <* delim <*>
getSubfield <* delim <*>
AB.decimal <* AB.endOfLine
where getSubfield = AB.takeTill (== '|')
delim = AB.char '|'
--
aempty :: Accounts
aempty = Accounts HashMap.empty
aappend :: Accounts -> AccountLine -> Accounts
aappend (Accounts cps) (AccountLine aid an cid cp) =
case HashMap.lookup (toS aid) cps of
Nothing -> Accounts (HashMap.insert (toS aid) (Set.singleton cp) cps)
Just value -> Accounts (HashMap.update (\codes -> Just (Set.insert cp value)) (toS aid) cps)
foldAccounts :: (Monad m) => Parser AccountLine m Accounts
foldAccounts = foldAll aappend aempty identity
readByteStringFile :: (MonadSafe m) => FilePath -> Producer' ByteString m ()
readByteStringFile file = PSP.withFile file ReadMode PB.fromHandle
accountLines :: Text -> MonadSafe m => Producer AccountLine m (Either (PA.ParsingError, Producer ByteString m ()) ())
accountLines filename = PA.parsed parseAccountLine (readByteStringFile (toS filename))
main :: IO ()
main = do
[filename] <- getArgs
x <- runSafeT $ runEffect $ Pipes.Parse.evalStateT foldAccounts (accountLines (toS filename))
print $ sizes x
-- print x
print $ lookupAccountFromCode x 254741
print $ lookupAccountFromCode x 196939
sizes :: Accounts -> Int
sizes (Accounts acp) = HashMap.size acp
lookupAccountFromCode :: Accounts -> Word32 -> Maybe Text
lookupAccountFromCode (Accounts accts) cp = do
let f a k v = bool a (Just k) (Set.member cp v)
HashMap.foldlWithKey' f Nothing accts
跑步
$ ./account-parser /tmp/accounts +RTS -s -c
5837
Just "1-PCECJ5"
Just "AANA-76KOUU"
1,652,177,904 bytes allocated in the heap
83,767,440 bytes copied during GC
17,563,800 bytes maximum residency (18 sample(s))
751,144 bytes maximum slop
34 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 3083 colls, 0 par 0.058s 0.069s 0.0000s 0.0002s
Gen 1 18 colls, 0 par 0.115s 0.151s 0.0084s 0.0317s
TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1)
SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)
INIT time 0.000s ( 0.002s elapsed)
MUT time 0.263s ( 0.289s elapsed)
GC time 0.173s ( 0.219s elapsed)
EXIT time 0.009s ( 0.008s elapsed)
Total time 0.445s ( 0.518s elapsed)
Alloc rate 6,286,682,587 bytes per MUT second
Productivity 61.0% of total user, 57.4% of total elapsed
gc_alloc_block_sync: 0
whitehole_spin: 0
gen[0].sync: 0
gen[1].sync: 0
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.