MLflow with an HDFS artifact store

Problem Description

I need some help configuring HDFS as the artifact store for MLflow. I have both MLflow and HDFS running in separate containers on a Docker network. When I try to log a model, I get the following error:

FileNotFoundError                         Traceback (most recent call last)
<ipython-input-35-e54b25688d8e> in <module>
      1 # log model artifacts
----> 2 pyfunc.log_model('hdfs://hdfs:8020/', python_model=LGBWrapper(), artifacts=artifacts, conda_env=conda_env)
      3 # pyfunc.save_model('prediction_model8', python_model=LGBWrapper(), artifacts=artifacts, conda_env=conda_env)
      4 
      5 # set tag for selecting model

~/opt/anaconda3/envs/soptai/lib/python3.6/site-packages/mlflow/pyfunc/__init__.py in log_model(artifact_path, loader_module, data_path, code_path, conda_env, python_model, artifacts, registered_model_name)
    697                      artifacts=artifacts,
    698                      conda_env=conda_env,
--> 699                      registered_model_name=registered_model_name)
    700 
    701 

~/opt/anaconda3/envs/soptai/lib/python3.6/site-packages/mlflow/models/__init__.py in log(cls, artifact_path, flavor, registered_model_name, **kwargs)
    100             mlflow_model = cls(artifact_path=artifact_path, run_id=run_id)
    101             flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
--> 102             mlflow.tracking.fluent.log_artifacts(local_path, artifact_path)
    103             try:
    104                 mlflow.tracking.fluent._record_logged_model(mlflow_model)

~/opt/anaconda3/envs/soptai/lib/python3.6/site-packages/mlflow/tracking/fluent.py in log_artifacts(local_dir, artifact_path)
    321     """
    322     run_id = _get_or_start_run().info.run_id
--> 323     MlflowClient().log_artifacts(run_id, local_dir, artifact_path)
    324 
    325 

~/opt/anaconda3/envs/soptai/lib/python3.6/site-packages/mlflow/tracking/client.py in log_artifacts(self, run_id, local_dir, artifact_path)
    265         :param artifact_path: If provided, the directory in ``artifact_uri`` to write to.
    266         """
--> 267         self._tracking_client.log_artifacts(run_id, local_dir, artifact_path)
    268 
    269     def _record_logged_model(self, run_id, mlflow_model):

~/opt/anaconda3/envs/soptai/lib/python3.6/site-packages/mlflow/tracking/_tracking_service/client.py in log_artifacts(self, run_id, local_dir, artifact_path)
    266         run = self.get_run(run_id)
    267         artifact_repo = get_artifact_repository(run.info.artifact_uri)
--> 268         artifact_repo.log_artifacts(local_dir, artifact_path)
    269 
    270     def list_artifacts(self, run_id, path=None):

~/opt/anaconda3/envs/soptai/lib/python3.6/site-packages/mlflow/store/artifact/hdfs_artifact_repo.py in log_artifacts(self, local_dir, artifact_path)
     47         hdfs_base_path = _resolve_base_path(self.path, artifact_path)
     48 
---> 49         with hdfs_system(host=self.host, port=self.port) as hdfs:
     50 
     51             if not hdfs.exists(hdfs_base_path):

~/opt/anaconda3/envs/soptai/lib/python3.6/contextlib.py in __enter__(self)
     79     def __enter__(self):
     80         try:
---> 81             return next(self.gen)
     82         except StopIteration:
     83             raise RuntimeError("generator didn't yield") from None

~/opt/anaconda3/envs/soptai/lib/python3.6/site-packages/mlflow/store/artifact/hdfs_artifact_repo.py in hdfs_system(host, port)
    175                                 driver=driver,
    176                                 kerb_ticket=kerb_ticket,
--> 177                                 extra_conf=extra_conf)
    178     yield connected
    179     connected.close()

~/opt/anaconda3/envs/soptai/lib/python3.6/site-packages/pyarrow/hdfs.py in connect(host, port, user, kerb_ticket, driver, extra_conf)
    213     fs = HadoopFileSystem(host=host, port=port, user=user,
    214                           kerb_ticket=kerb_ticket, driver=driver,
--> 215                           extra_conf=extra_conf)
    216     return fs

~/opt/anaconda3/envs/soptai/lib/python3.6/site-packages/pyarrow/hdfs.py in __init__(self, host, port, user, kerb_ticket, driver, extra_conf)
     36                  driver='libhdfs', extra_conf=None):
     37         if driver == 'libhdfs':
---> 38             _maybe_set_hadoop_classpath()
     39 
     40         self._connect(host, port, user, kerb_ticket, driver, extra_conf)

~/opt/anaconda3/envs/soptai/lib/python3.6/site-packages/pyarrow/hdfs.py in _maybe_set_hadoop_classpath()
    138             classpath = _hadoop_classpath_glob(hadoop_bin)
    139     else:
--> 140         classpath = _hadoop_classpath_glob('hadoop')
    141 
    142     os.environ['CLASSPATH'] = classpath.decode('utf-8')

~/opt/anaconda3/envs/soptai/lib/python3.6/site-packages/pyarrow/hdfs.py in _hadoop_classpath_glob(hadoop_bin)
    163 
    164     hadoop_classpath_args = (hadoop_bin, 'classpath', '--glob')
--> 165     return subprocess.check_output(hadoop_classpath_args)
    166 
    167 

~/opt/anaconda3/envs/soptai/lib/python3.6/subprocess.py in check_output(timeout, *popenargs, **kwargs)
    354 
    355     return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
--> 356                **kwargs).stdout
    357 
    358 

~/opt/anaconda3/envs/soptai/lib/python3.6/subprocess.py in run(input, timeout, check, *popenargs, **kwargs)
    421         kwargs['stdin'] = PIPE
    422 
--> 423     with Popen(*popenargs, **kwargs) as process:
    424         try:
    425             stdout, stderr = process.communicate(input, timeout=timeout)

~/opt/anaconda3/envs/soptai/lib/python3.6/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)
    727                                 c2pread, c2pwrite,
    728                                 errread, errwrite,
--> 729                                 restore_signals, start_new_session)
    730         except:
    731             # Cleanup if the child failed starting.

~/opt/anaconda3/envs/soptai/lib/python3.6/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
   1362                         if errno_num == errno.ENOENT:
   1363                             err_msg += ': ' + repr(err_filename)
-> 1364                     raise child_exception_type(errno_num, err_msg, err_filename)
   1365                 raise child_exception_type(err_msg)
   1366 

FileNotFoundError: [Errno 2] No such file or directory: 'hadoop': 'hadoop'

Access to HDFS itself is not the problem: the containers are on the same network, and other services running on that network can reach HDFS without issues. As someone reporting a similar problem suggested (https://github.com/mlflow/mlflow/issues/1466), some changes to core-site.xml or hdfs-site.xml may be needed. Unfortunately, I do not know what those changes would have to be. Please assist!
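A quick way to separate network reachability from client-side setup is to call pyarrow directly from the MLflow container. The sketch below is illustrative only: it reuses the host name and port from the hdfs://hdfs:8020 URI in the traceback and goes through the same pyarrow.hdfs.connect() call that MLflow's hdfs_artifact_repo uses, so it will keep raising the same 'hadoop' error until the Hadoop client tooling is available locally.

import pyarrow as pa

# Connectivity check from the MLflow container. Host and port are taken from the
# hdfs://hdfs:8020 URI above; adjust them if the namenode listens elsewhere.
# This goes through the same HadoopFileSystem setup as log_artifacts, so it fails
# with the identical FileNotFoundError for 'hadoop' until a Hadoop client is installed.
fs = pa.hdfs.connect(host="hdfs", port=8020)
print(fs.ls("/"))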

Tags: hdfs, mlflow

Solution
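
The traceback ends inside pyarrow rather than inside MLflow: _maybe_set_hadoop_classpath() shells out to "hadoop classpath --glob", and the FileNotFoundError for 'hadoop' means the Hadoop command-line client is simply not present (or not on PATH) in the container that calls log_model. pyarrow's HadoopFileSystem talks to HDFS through libhdfs, which needs a local Hadoop distribution and a JVM even when the cluster itself is reachable over the network, so changes to core-site.xml or hdfs-site.xml on the HDFS side will not fix this on their own. A minimal client-side sketch, assuming a Hadoop distribution unpacked at /opt/hadoop and OpenJDK 8 installed in the MLflow image (both paths are illustrative):

import os

# Illustrative paths: they assume a Hadoop distribution at /opt/hadoop and
# OpenJDK 8 inside the MLflow container; adjust both to match the actual image.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["HADOOP_HOME"] = "/opt/hadoop"
# Put the hadoop CLI on PATH as well, so the "hadoop classpath --glob" fallback
# shown in the traceback can succeed.
os.environ["PATH"] = os.environ["HADOOP_HOME"] + "/bin:" + os.environ["PATH"]
# libhdfs.so ships with the Hadoop distribution; pyarrow looks it up via this variable.
os.environ["ARROW_LIBHDFS_DIR"] = os.environ["HADOOP_HOME"] + "/lib/native"

Separately, the signature in the traceback shows that the first positional argument of pyfunc.log_model() is artifact_path, a sub-directory inside the run's artifact location, not a destination URI; the HDFS location itself comes from run.info.artifact_uri, which is determined by the experiment (or by the tracking server's default artifact root). A hypothetical call sequence, reusing the names from the question and with example values for the tracking URI and artifact location:

import mlflow
from mlflow import pyfunc

# Hypothetical values: the tracking URI and artifact location are examples only.
mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.create_experiment("lgb-model",
                         artifact_location="hdfs://hdfs:8020/mlflow/artifacts")  # run once
mlflow.set_experiment("lgb-model")

with mlflow.start_run():
    # artifact_path is a directory name inside the run, not a URI.
    pyfunc.log_model("prediction_model", python_model=LGBWrapper(),
                     artifacts=artifacts, conda_env=conda_env)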

