![](/img/trans.png)
[英]Error in Python when importing sklearn ..cannot import name 'Logger' from 'joblib.logger'
[英]Catch sklearn joblib output to python logging
使用 sklearn 时,我想查看输出。因此,我在可用的情况下使用 verbose 参数。
通常,我需要时间戳、进程 ID 等,所以我尽可能使用 python logging
模块。 之前已经完成了将sklearn output获取到logging
模块,例如https://stackoverflow.com/a/50803365
但是,我想并行运行,joblib 也直接使用sys.stdout
和sys.stderr
。 因此,我的尝试(见下文)不起作用。
import logging
import sys
import contextlib
class LogAdapter:
    """File-like adapter that forwards written text to a logger method.

    Instances are meant to replace ``sys.stdout`` / ``sys.stderr`` so that
    print-style output is routed through the ``logging`` module.
    """

    def __init__(self, level, logger) -> None:
        """Bind ``self.report`` to the logger method matching *level*.

        Args:
            level: Either ``'INFO'`` (route to ``logger.info``) or
                ``'ERROR'`` (route to ``logger.error``).
            logger: A ``logging.Logger`` (or compatible) instance.

        Raises:
            ValueError: If *level* is not one of the supported names.
                (The original silently left ``self.report`` unset, which
                caused a confusing AttributeError on the first ``write()``.)
        """
        if level == 'INFO':
            self.report = logger.info
        elif level == 'ERROR':
            self.report = logger.error
        else:
            raise ValueError(f"unsupported level: {level!r} (expected 'INFO' or 'ERROR')")

    def write(self, msg):
        """Log *msg* unless it is empty after stripping trailing whitespace."""
        stripped = msg.rstrip()
        if stripped:
            self.report(stripped)

    def flush(self):
        """No-op; present only to satisfy the file-object protocol."""
        pass
@contextlib.contextmanager
def redirect_to_log(logger):
    """Temporarily route ``sys.stdout``/``sys.stderr`` through *logger*.

    stdout lines are logged at INFO level, stderr lines at ERROR level.

    The restore step is in a ``finally`` clause so the original streams
    come back even if the managed body raises (the original version
    leaked the redirection on any exception).

    Args:
        logger: Target ``logging.Logger`` for the redirected output.
    """
    originals = sys.stdout, sys.stderr
    sys.stdout = LogAdapter(level='INFO', logger=logger)
    sys.stderr = LogAdapter(level='ERROR', logger=logger)
    try:
        yield
    finally:
        sys.stdout, sys.stderr = originals
def test_case():
    """Fit a tiny RandomForest under each joblib backend with output redirected to logging."""
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.utils import parallel_backend

    logger = logging.getLogger(__name__)
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(process)d | %(asctime)s | %(name)14s | %(levelname)7s | %(message)s",
    )

    samples = [[0, 0], [1, 1]]
    labels = [0, 1]
    for backend_name in ('loky', 'threading'):
        logger.info(f"Testing backend {backend_name}")
        # Redirect stdout/stderr only while the backend is active.
        with parallel_backend(backend_name), redirect_to_log(logger):
            model = RandomForestClassifier(2, verbose=4)
            model = model.fit(samples, labels)
# Run the demo only when executed as a script (no side effects on import).
if __name__ == "__main__":
    test_case()
我得到 output
19320 | 2022-11-30 17:49:16,938 | __main__ | INFO | Testing backend loky
19320 | 2022-11-30 17:49:16,951 | __main__ | ERROR | [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
building tree 1 of 2
building tree 2 of 2
19320 | 2022-11-30 17:49:18,923 | __main__ | ERROR | [Parallel(n_jobs=-1)]: Done 2 out of 2 | elapsed: 1.9s remaining: 0.0s
19320 | 2022-11-30 17:49:18,923 | __main__ | ERROR | [Parallel(n_jobs=-1)]: Done 2 out of 2 | elapsed: 1.9s finished
19320 | 2022-11-30 17:49:18,924 | __main__ | INFO | Testing backend threading
19320 | 2022-11-30 17:49:18,925 | __main__ | ERROR | [Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
19320 | 2022-11-30 17:49:18,932 | __main__ | INFO | building tree 1 of 2
19320 | 2022-11-30 17:49:18,932 | __main__ | INFO | building tree 2 of 2
19320 | 2022-11-30 17:49:18,934 | __main__ | ERROR | [Parallel(n_jobs=-1)]: Done 2 out of 2 | elapsed: 0.0s remaining: 0.0s
19320 | 2022-11-30 17:49:18,934 | __main__ | ERROR | [Parallel(n_jobs=-1)]: Done 2 out of 2 | elapsed: 0.0s finished
如您所见,它可以很好地与threading
后端配合使用,但不适用于loky
后端。 Loky 用于多处理,我的上下文管理器只在主进程中捕获stdout
和stderr
。 如何捕获子进程的标准输出并将它们放入标准 python 日志记录中?
如果它是一个普通的 python 子进程调用,我可以通过提供管道来捕获 IO,如 https://codereview.stackexchange.com/questions/6567/redirecting-subprocesses-output-stdout-and-stderr-to-the-logging-module
我意识到,其他人在我之前用 loky 尝试过但失败了。 一种选择是确保将“设置日志记录”调用附加到通过 joblib 推送的每个作业。 这可能行得通,但据我所知,sklearn 并没有公开那种程度的细节。 参见例如https://github.com/joblib/joblib/issues/1017
我确实想出了一个使用dask
后端的解决方法。 我定义了一个 worker 插件,它本质上是我的 contextmanager
import dask.distributed
class LogPlugin(dask.distributed.WorkerPlugin):
name = "LoggerRedirector"
def setup(self, worker: dask.distributed.Worker):
self.originals = sys.stdout, sys.stderr
init_logging()
sys.stdout = LogAdapter(level='INFO',logger=logging.getLogger(__name__))
sys.stderr = LogAdapter(level='ERROR',logger=logging.getLogger(__name__))
def teardown(self, worker: dask.distributed.Worker):
sys.stdout, sys.stderr = self.originals
然后在 dask 后端注册它
client = dask.distributed.Client()
client.register_worker_plugin(LogPlugin())
我现在可以在多进程场景下获得所需的输出。
这是一个可以接受的解决方案,但有点烦人,因为 dask 的开销比 loky 大,并且对我强加了新的依赖性。
新的完整代码是:
import logging
import sys
import contextlib
class LogAdapter:
    """File-like adapter that forwards written text to a logger method.

    Instances are meant to replace ``sys.stdout`` / ``sys.stderr`` so that
    print-style output is routed through the ``logging`` module.
    """

    def __init__(self, level, logger) -> None:
        """Bind ``self.report`` to the logger method matching *level*.

        Args:
            level: Either ``'INFO'`` (route to ``logger.info``) or
                ``'ERROR'`` (route to ``logger.error``).
            logger: A ``logging.Logger`` (or compatible) instance.

        Raises:
            ValueError: If *level* is not one of the supported names.
                (The original silently left ``self.report`` unset, which
                caused a confusing AttributeError on the first ``write()``.)
        """
        if level == 'INFO':
            self.report = logger.info
        elif level == 'ERROR':
            self.report = logger.error
        else:
            raise ValueError(f"unsupported level: {level!r} (expected 'INFO' or 'ERROR')")

    def write(self, msg):
        """Log *msg* unless it is empty after stripping trailing whitespace."""
        stripped = msg.rstrip()
        if stripped:
            self.report(stripped)

    def flush(self):
        """No-op; present only to satisfy the file-object protocol."""
        pass
@contextlib.contextmanager
def redirect_to_log(logger):
    """Temporarily route ``sys.stdout``/``sys.stderr`` through *logger*.

    stdout lines are logged at INFO level, stderr lines at ERROR level.

    The restore step is in a ``finally`` clause so the original streams
    come back even if the managed body raises (the original version
    leaked the redirection on any exception).

    Args:
        logger: Target ``logging.Logger`` for the redirected output.
    """
    originals = sys.stdout, sys.stderr
    sys.stdout = LogAdapter(level='INFO', logger=logger)
    sys.stderr = LogAdapter(level='ERROR', logger=logger)
    try:
        yield
    finally:
        sys.stdout, sys.stderr = originals
def init_logging():
    """Configure the root logger with pid/timestamp formatting at DEBUG level.

    Safe to call multiple times: ``basicConfig`` is a no-op once the root
    logger already has handlers, which is why each dask worker can call it.
    """
    log_format = "%(process)d | %(asctime)s | %(name)14s | %(levelname)7s | %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=log_format)
import dask.distributed
class LogPlugin(dask.distributed.WorkerPlugin):
    """Dask worker plugin that redirects each worker's stdout/stderr to logging.

    ``setup``/``teardown`` run inside every worker process, which is what
    lets this approach capture output from multiprocessing workers.
    """

    name = "LoggerRedirector"

    def setup(self, worker: dask.distributed.Worker):
        # Remember the real streams so teardown() can restore them.
        self.originals = sys.stdout, sys.stderr
        init_logging()
        worker_logger = logging.getLogger(__name__)
        sys.stdout = LogAdapter(level='INFO', logger=worker_logger)
        sys.stderr = LogAdapter(level='ERROR', logger=worker_logger)

    def teardown(self, worker: dask.distributed.Worker):
        # Restore the streams captured in setup().
        sys.stdout, sys.stderr = self.originals
def test_case():
    """Fit a tiny RandomForest under the loky, threading and dask backends."""
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.utils import parallel_backend

    # Start a local dask cluster and install the redirection plugin on
    # every worker before any jobs are dispatched.
    client = dask.distributed.Client()
    client.register_worker_plugin(LogPlugin())

    logger = logging.getLogger(__name__)
    init_logging()

    samples = [[0, 0], [1, 1]]
    labels = [0, 1]
    for backend_name in ('loky', 'threading', 'dask'):
        logger.info(f"Testing backend {backend_name}")
        # Redirect the parent process's streams for the duration of the fit.
        with parallel_backend(backend_name), redirect_to_log(logger):
            model = RandomForestClassifier(2, verbose=4)
            model = model.fit(samples, labels)
# Run the demo only when executed as a script (no side effects on import).
if __name__ == "__main__":
    test_case()
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.