簡體   English   中英

基於 Haskell 的文件流導致 memory 泄漏

[英]Haskell based file streaming causes memory leak

我對 Haskell 還很陌生,我正在研究現有的代碼庫,該代碼庫從文件共享中收集文件。 為了並行處理文件共享,使用了Conduit 腳手架基於本教程 為了連續讀取文件共享,我添加了延遲和對streamFile function 的遞歸調用。 我不確定這是否是問題所在,但 memory 分配不斷增加,高達數 GB。

導致 memory 泄漏的問題可能是什么?

module FileScraper(runFileScraperFinal, FileScraper, watch, watchDirectories) where

import           Actions                         (PostProcAction)
import           Colog                           (LogAction, Msg, Severity)
import           Conduit                         (ConduitM, ConduitT, MonadIO (..), MonadResource, MonadTrans (lift), MonadUnliftIO (withRunInIO), ResourceT, await, bracketP, mapMC, mapM_C, runConduit, runResourceT, yield, (.|), takeWhileC)
import           Control.Concurrent              (threadDelay)
import qualified Control.Concurrent.Async        as Async
import qualified Control.Concurrent.STM          as STM
import qualified Control.Concurrent.STM.TBMQueue as STM
import           Data.ByteString                 (ByteString, readFile)
import           Data.Conduit.Combinators        (filterM, yieldMany)
import           Data.Functor                    ((<&>))
import           Data.Text                       (Text, unpack)
import           Filters                         (FileFilter, DirectoryFilter)
import           Polysemy                        (Final, Inspector (inspect), Member, Sem, makeSem)
import           Polysemy.Final                  (bindS, getInitialStateS, getInspectorS, interpretFinal, liftS)
import           Prelude                         hiding (filter, init, readFile)
import           System.FilePath.Find            (find, RecursionPredicate, (/~?), filePath, (&&?), (==?), fileType, FileType (RegularFile), always)
import           System.Posix                    (raiseSignal, sigTERM)

data FileScraper m a where
  Watch :: [(Text, Text, FileFilter, DirectoryFilter, PostProcAction)] -> (FilePath -> ByteString -> Text -> PostProcAction -> m Bool) -> FileScraper m ()

makeSem ''FileScraper

runFileScraperFinal :: forall m. (MonadUnliftIO m => forall r a. (Member (Final m) r) => LogAction m (Msg Severity) -> Sem (FileScraper ': r) a -> Sem r a)
runFileScraperFinal _ = do
  interpretFinal @m (\case
    Watch sources callback -> do
      is <- getInitialStateS
      ins <- getInspectorS
      cb' <- bindS $ uncurry4 callback
      liftS $ withRunInIO $ \runInIO -> liftIO $ do
        runResourceT . runConduit $ watchDirectories sources .| mapMC (\(fp,fc,dest,ppa) -> lift $ do
          eff <- runInIO $ cb' ((fp,fc,dest,ppa) <$ is)
          case inspect ins eff of
            Nothing -> do
              raiseSignal sigTERM
              pure False
            Just v -> do
              pure v
          ) .| takeWhileC id .| mapM_C (const $ pure ())
    )

uncurry4 :: (a -> b -> c -> d -> e) -> ((a, b, c, d) -> e)
uncurry4 f ~(a,b,c,d) = f a b c d

watchDirectories :: MonadResource m => [(Text, Text, FileFilter, DirectoryFilter, PostProcAction)] -> ConduitM a (FilePath, ByteString, Text, PostProcAction) m ()
watchDirectories sourceToFilterMap = parSources (fmap (\(src, dest, filter, dirFilter, postProcActions) -> streamFile (unpack src) dest filter dirFilter postProcActions) sourceToFilterMap)

streamFile :: MonadResource m => FilePath -> Text -> FileFilter -> DirectoryFilter -> PostProcAction -> ConduitM a (FilePath, ByteString, Text, PostProcAction) m ()
streamFile baseDir destination filter dirFilter postProcActions = do
    newFiles <- liftIO $ find (recursionPredicate dirFilter) (fileType ==? RegularFile) baseDir
    yieldMany newFiles .| filterM (liftIO . filter) .| mapMC (\entry -> do
      liftIO $ readFile entry <&> (entry,,destination,postProcActions))
    let minutes :: Int = 60_000_000
    liftIO $ threadDelay (5 * minutes)
    streamFile baseDir destination filter dirFilter postProcActions
    where
      recursionPredicate :: DirectoryFilter -> RecursionPredicate
      recursionPredicate df = case df of
        [] -> always
        excludes -> foldl1 (&&?) $ map ((/~?) filePath . unpack) excludes

parSources :: (MonadResource m, Foldable f) => f (ConduitM () o (ResourceT IO) ()) -> ConduitT i o m ()
parSources sources = bracketP init cleanup finalSource
  where
    init = do
        -- create the queue where all sources will put their items
        queue <- STM.newTBMQueueIO 100

        -- In a separate thread, run concurrently all conduits
        a <- Async.async $ do
            Async.mapConcurrently_ (\source -> runResourceT $ runConduit (source .| sinkQueue queue)) sources
            -- once all conduits are done, close the queue
            STM.atomically (STM.closeTBMQueue queue)
        pure (a, queue)
    cleanup (async, queue) = do
        -- upon exception or cancellation, close the queue and cancel the threads
        STM.atomically (STM.closeTBMQueue queue)
        Async.cancel async
    finalSource (_, queue) = sourceQueue queue

sourceQueue :: MonadIO m => STM.TBMQueue o -> ConduitT i o m ()
sourceQueue queue = do
        mbItem <- liftIO $ STM.atomically (STM.readTBMQueue queue)
        case mbItem of
            Nothing -> pure ()  -- queue closed
            Just item -> yield item *> sourceQueue queue

sinkQueue :: MonadIO m => STM.TBMQueue a -> ConduitT a o m ()
sinkQueue queue = do
        mbItem <- await
        case mbItem of
            Nothing -> pure ()  -- no more items to come
            Just item -> do
                liftIO $ STM.atomically (STM.writeTBMQueue queue item)
                sinkQueue queue

更新(添加了使用回調的 function):

...
void $ async $ watch normalisedPrefixedSources (\fp content dest ppa -> do
    log Info $ "Sending file " <> pack fp

    result <- await =<< send (unpack dest) content

    case result of
      Just True -> do
        log Info $ "File sent " <> pack fp
        res <- embed @m $ liftIO $ ppa fp
        if res then pure True else do
          log Error "Raise signal for graceful shutdown."
          embed @m $ liftIO $ raiseSignal sigTERM
          pure False
      _ -> do
        log Error $ "Error sending file " <> pack fp <> ". Raise signal for graceful shutdown."
        embed @m $ liftIO $ raiseSignal sigTERM
        pure False
    )
...

更新 2 :從配置中刪除冪等過濾器后(@KA Buhr 的更改仍然存在)memory 消耗是恆定的。


type FileFilter = FilePath -> IO Bool

createIdempotentFilter :: LogAction IO Message -> M.Idempotent -> IO FileFilter
createIdempotentFilter la filterConfig = do
    cache <- newIORef []
    let configuredCacheSize :: Int = fromIntegral $ M.lruCacheSize filterConfig
    pure $ \path -> do
        fileModificationEpoch <- getModificationTime path
        cache' <- readIORef cache
        if (path, fileModificationEpoch) `elem` cache' then do
            la <& logText Debug ("File already in cache " <> pack path <> " | " <> pack (show fileModificationEpoch))
            pure False
        else do
            la <& logText Debug ("File not in cache " <> pack path <> " | " <> pack (show fileModificationEpoch))
            let alreadyScanned' = cache' <> [(path, fileModificationEpoch)]
            writeIORef cache $ drop (length alreadyScanned' - configuredCacheSize) alreadyScanned'
            pure True

首先,確保您排除了文件內容的ByteString作為泄漏源。 您將擁有等於有界隊列長度的最大運行文件數,因此您的高水位線將是來自輸入文件系統的 100 個文件的任意集合的內容。 如果您正在處理具有大型視頻/圖像文件的文件系統,您可能會看到不穩定的瞬態峰值。 此外,如果您的回調持有對(部分或全部)這些文件的路徑名和/或內容的引用,您將看到非常嚴重的空間泄漏。 通過將readFile entry替換為return mempty並使用 null 回調(\_ _ _ _ -> return True)來排除所有這些。

在自己進行了類似的更改后,我能夠復制您的空間泄漏並將其歸結為兩個技術問題。

第一個是:

.| takeWhileC id .| mapM_C (const $ pure ())

將其替換為:

.| Control.Monad.void andC

將通過測試文件系統的單次傳遞的最大駐留從 130MB 減少到 15MB,但在堆配置文件上的堆使用量仍然具有特征性的線性增加。

第二個是:

yield item *> sourceQueue queue

將其替換為:

yield item >> sourceQueue queue

完全消除了泄漏。 最大駐留空間僅為 2MB,並且多次通過測試文件系統的堆配置文件沒有明顯的泄漏。

對於任何一個問題,我都不確定這里發生了什么。 *>>>問題是我以前見過的問題。 雖然這些在語義上是等價的,但它們不一定具有相同的實現,有時*>會泄漏>>沒有的空間。 然而, takeWhileC問題對我來說是個謎。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM