[英]Haskell based file streaming causes memory leak
我對 Haskell 還很陌生,我正在研究現有的代碼庫,該代碼庫從文件共享中收集文件。 為了並行處理文件共享,使用了Conduit
。 腳手架基於本教程。 為了連續讀取文件共享,我添加了延遲和對streamFile
function 的遞歸調用。 我不確定這是否是問題所在,但 memory 分配不斷增加,高達數 GB。
導致 memory 泄漏的問題可能是什么?
module FileScraper(runFileScraperFinal, FileScraper, watch, watchDirectories) where
import Actions (PostProcAction)
import Colog (LogAction, Msg, Severity)
import Conduit (ConduitM, ConduitT, MonadIO (..), MonadResource, MonadTrans (lift), MonadUnliftIO (withRunInIO), ResourceT, await, bracketP, mapMC, mapM_C, runConduit, runResourceT, yield, (.|), takeWhileC)
import Control.Concurrent (threadDelay)
import qualified Control.Concurrent.Async as Async
import qualified Control.Concurrent.STM as STM
import qualified Control.Concurrent.STM.TBMQueue as STM
import Data.ByteString (ByteString, readFile)
import Data.Conduit.Combinators (filterM, yieldMany)
import Data.Functor ((<&>))
import Data.Text (Text, unpack)
import Filters (FileFilter, DirectoryFilter)
import Polysemy (Final, Inspector (inspect), Member, Sem, makeSem)
import Polysemy.Final (bindS, getInitialStateS, getInspectorS, interpretFinal, liftS)
import Prelude hiding (filter, init, readFile)
import System.FilePath.Find (find, RecursionPredicate, (/~?), filePath, (&&?), (==?), fileType, FileType (RegularFile), always)
import System.Posix (raiseSignal, sigTERM)
data FileScraper m a where
Watch :: [(Text, Text, FileFilter, DirectoryFilter, PostProcAction)] -> (FilePath -> ByteString -> Text -> PostProcAction -> m Bool) -> FileScraper m ()
makeSem ''FileScraper
runFileScraperFinal :: forall m. (MonadUnliftIO m => forall r a. (Member (Final m) r) => LogAction m (Msg Severity) -> Sem (FileScraper ': r) a -> Sem r a)
runFileScraperFinal _ = do
interpretFinal @m (\case
Watch sources callback -> do
is <- getInitialStateS
ins <- getInspectorS
cb' <- bindS $ uncurry4 callback
liftS $ withRunInIO $ \runInIO -> liftIO $ do
runResourceT . runConduit $ watchDirectories sources .| mapMC (\(fp,fc,dest,ppa) -> lift $ do
eff <- runInIO $ cb' ((fp,fc,dest,ppa) <$ is)
case inspect ins eff of
Nothing -> do
raiseSignal sigTERM
pure False
Just v -> do
pure v
) .| takeWhileC id .| mapM_C (const $ pure ())
)
uncurry4 :: (a -> b -> c -> d -> e) -> ((a, b, c, d) -> e)
uncurry4 f ~(a,b,c,d) = f a b c d
watchDirectories :: MonadResource m => [(Text, Text, FileFilter, DirectoryFilter, PostProcAction)] -> ConduitM a (FilePath, ByteString, Text, PostProcAction) m ()
watchDirectories sourceToFilterMap = parSources (fmap (\(src, dest, filter, dirFilter, postProcActions) -> streamFile (unpack src) dest filter dirFilter postProcActions) sourceToFilterMap)
streamFile :: MonadResource m => FilePath -> Text -> FileFilter -> DirectoryFilter -> PostProcAction -> ConduitM a (FilePath, ByteString, Text, PostProcAction) m ()
streamFile baseDir destination filter dirFilter postProcActions = do
newFiles <- liftIO $ find (recursionPredicate dirFilter) (fileType ==? RegularFile) baseDir
yieldMany newFiles .| filterM (liftIO . filter) .| mapMC (\entry -> do
liftIO $ readFile entry <&> (entry,,destination,postProcActions))
let minutes :: Int = 60_000_000
liftIO $ threadDelay (5 * minutes)
streamFile baseDir destination filter dirFilter postProcActions
where
recursionPredicate :: DirectoryFilter -> RecursionPredicate
recursionPredicate df = case df of
[] -> always
excludes -> foldl1 (&&?) $ map ((/~?) filePath . unpack) excludes
parSources :: (MonadResource m, Foldable f) => f (ConduitM () o (ResourceT IO) ()) -> ConduitT i o m ()
parSources sources = bracketP init cleanup finalSource
where
init = do
-- create the queue where all sources will put their items
queue <- STM.newTBMQueueIO 100
-- In a separate thread, run concurrently all conduits
a <- Async.async $ do
Async.mapConcurrently_ (\source -> runResourceT $ runConduit (source .| sinkQueue queue)) sources
-- once all conduits are done, close the queue
STM.atomically (STM.closeTBMQueue queue)
pure (a, queue)
cleanup (async, queue) = do
-- upon exception or cancellation, close the queue and cancel the threads
STM.atomically (STM.closeTBMQueue queue)
Async.cancel async
finalSource (_, queue) = sourceQueue queue
sourceQueue :: MonadIO m => STM.TBMQueue o -> ConduitT i o m ()
sourceQueue queue = do
mbItem <- liftIO $ STM.atomically (STM.readTBMQueue queue)
case mbItem of
Nothing -> pure () -- queue closed
Just item -> yield item *> sourceQueue queue
sinkQueue :: MonadIO m => STM.TBMQueue a -> ConduitT a o m ()
sinkQueue queue = do
mbItem <- await
case mbItem of
Nothing -> pure () -- no more items to come
Just item -> do
liftIO $ STM.atomically (STM.writeTBMQueue queue item)
sinkQueue queue
更新(添加了使用回調的 function):
...
void $ async $ watch normalisedPrefixedSources (\fp content dest ppa -> do
log Info $ "Sending file " <> pack fp
result <- await =<< send (unpack dest) content
case result of
Just True -> do
log Info $ "File sent " <> pack fp
res <- embed @m $ liftIO $ ppa fp
if res then pure True else do
log Error "Raise signal for graceful shutdown."
embed @m $ liftIO $ raiseSignal sigTERM
pure False
_ -> do
log Error $ "Error sending file " <> pack fp <> ". Raise signal for graceful shutdown."
embed @m $ liftIO $ raiseSignal sigTERM
pure False
)
...
更新 2 :從配置中刪除冪等過濾器后(@KA Buhr 的更改仍然存在)memory 消耗是恆定的。
type FileFilter = FilePath -> IO Bool
createIdempotentFilter :: LogAction IO Message -> M.Idempotent -> IO FileFilter
createIdempotentFilter la filterConfig = do
cache <- newIORef []
let configuredCacheSize :: Int = fromIntegral $ M.lruCacheSize filterConfig
pure $ \path -> do
fileModificationEpoch <- getModificationTime path
cache' <- readIORef cache
if (path, fileModificationEpoch) `elem` cache' then do
la <& logText Debug ("File already in cache " <> pack path <> " | " <> pack (show fileModificationEpoch))
pure False
else do
la <& logText Debug ("File not in cache " <> pack path <> " | " <> pack (show fileModificationEpoch))
let alreadyScanned' = cache' <> [(path, fileModificationEpoch)]
writeIORef cache $ drop (length alreadyScanned' - configuredCacheSize) alreadyScanned'
pure True
首先,確保您排除了文件內容的ByteString
作為泄漏源。 您將擁有等於有界隊列長度的最大運行文件數,因此您的高水位線將是來自輸入文件系統的 100 個文件的任意集合的內容。 如果您正在處理具有大型視頻/圖像文件的文件系統,您可能會看到不穩定的瞬態峰值。 此外,如果您的回調持有對(部分或全部)這些文件的路徑名和/或內容的引用,您將看到非常嚴重的空間泄漏。 通過將readFile entry
替換為return mempty
並使用 null 回調(\_ _ _ _ -> return True)
來排除所有這些。
在自己進行了類似的更改后,我能夠復制您的空間泄漏並將其歸結為兩個技術問題。
第一個是:
.| takeWhileC id .| mapM_C (const $ pure ())
將其替換為:
.| Control.Monad.void andC
將通過測試文件系統的單次傳遞的最大駐留從 130MB 減少到 15MB,但在堆配置文件上的堆使用量仍然具有特征性的線性增加。
第二個是:
yield item *> sourceQueue queue
將其替換為:
yield item >> sourceQueue queue
完全消除了泄漏。 最大駐留空間僅為 2MB,並且多次通過測試文件系統的堆配置文件沒有明顯的泄漏。
對於任何一個問題,我都不確定這里發生了什么。 *>
與>>
問題是我以前見過的問題。 雖然這些在語義上是等價的,但它們不一定具有相同的實現,有時*>
會泄漏>>
沒有的空間。 然而, takeWhileC
問題對我來說是個謎。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.