來自管道的 Azure ML 輸出

Question

我正在嘗試在 Microsoft Azure 中構建一個管道，在輸入中有（目前）一個簡單的 python 腳本。 問題是我找不到我的輸出。 在我的筆記本部分，我構建了以下兩個代碼：

1) 名為“test.ipynb”的腳本

# azureml-core of version 1.0.72 or higher is required
# Step script ("test.ipynb"): loads two registered datasets, concatenates them,
# writes the result to a CSV file and uploads that folder to the default datastore.
from azureml.core import Workspace, Dataset, Datastore
import pandas as pd
import numpy as np
import datetime
import math

# Connect to the workspace and load the two registered datasets by name
subscription_id = 'myid'
resource_group = 'myrg'
workspace_name = 'mywn'
workspace = Workspace(subscription_id, resource_group, workspace_name)
dataset_zre = Dataset.get_by_name(workspace, name='file1')
dataset_SLA = Dataset.get_by_name(workspace, name='file2')
df_zre = dataset_zre.to_pandas_dataframe()
df_SLA = dataset_SLA.to_pandas_dataframe()
# Stack both frames row-wise; sort=True sorts the columns when they are not aligned
result = pd.concat([df_SLA,df_zre], sort=True)
# Write the combined data to a hard-coded absolute path.
# NOTE(review): this path presumably exists only on the notebook VM, not on the
# remote compute target that executes the pipeline step -- confirm.
result.to_csv(path_or_buf="/mnt/azmnt/code/Users/aniello.spiezia/outputs/output.csv",index=False)

# Upload the local output folder to the workspace's default datastore; the
# target path reuses the same absolute path string as the source directory.
def_data_store = workspace.get_default_datastore()
def_data_store.upload(src_dir = '/mnt/azmnt/code/Users/aniello.spiezia/outputs', target_path = '/mnt/azmnt/code/Users/aniello.spiezia/outputs', overwrite = True)

print("\nFinished!")
#End of the file

2）管道代碼稱為“pipeline.ipynb”

import os
import pandas as pd
import json
import azureml.core
from azureml.core import Workspace, Run, Experiment, Datastore
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.runconfig import CondaDependencies, RunConfiguration
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.telemetry import set_diagnostics_collection
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline, PipelineData, StepSequence
# Report which SDK version the notebook is running against
print("SDK Version:", azureml.core.VERSION)

###############################
# Workspace and experiment setup: load the workspace from the local config
# file, print its identifying details and create the experiment handle.
ws = Workspace.from_config()
print('Workspace name: ' + ws.name, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')
experiment_name =  'aml-pipeline-cicd' # choose a name for experiment
project_folder = '.' # project folder
experiment = Experiment(ws, experiment_name)
print("Location:", ws.location)
# Opt in to sending diagnostics telemetry
set_diagnostics_collection(send_diagnostics=True)

###############################
# Run configuration: the conda environment (pip packages) used by the step.
# NOTE(review): this pins azureml-sdk==1.0.17, while the header of the step
# script states that azureml-core 1.0.72 or higher is required.
cd = CondaDependencies.create(pip_packages=["azureml-sdk==1.0.17", "azureml-train-automl==1.0.17", "pyculiarity", "pytictoc", "cryptography==2.5", "pandas"])
amlcompute_run_config = RunConfiguration(framework = "python", conda_dependencies = cd)
# Docker is disabled, although a base image is still assigned on the next line
amlcompute_run_config.environment.docker.enabled = False
amlcompute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
amlcompute_run_config.environment.spark.precache_packages = False

###############################
# Compute target: reuse the cluster named "aml-compute" if it can be
# constructed, otherwise provision a new one and wait for it.
aml_compute_target = "aml-compute"
try:
    aml_compute = AmlCompute(ws, aml_compute_target)
    print("found existing compute target.")
# NOTE(review): the bare except treats ANY failure (not only "target does not
# exist") as a signal to create a new compute target.
except:
    print("creating new compute target")

    # Cluster scales between 0 and 4 nodes and scales down after 1800 idle seconds
    provisioning_config = AmlCompute.provisioning_configuration(vm_size = "STANDARD_D2_V2", 
                                                                idle_seconds_before_scaledown=1800, 
                                                                min_nodes = 0, 
                                                                max_nodes = 4)
    aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
print("Azure Machine Learning Compute attached")

###############################
# Datastores and pipeline step definition.
def_data_store = ws.get_default_datastore()
def_blob_store = Datastore(ws, "workspaceblobstore")
print("Blobstore's name: {}".format(def_blob_store.name))
# Declare a named pipeline output backed by the blob store.
# NOTE(review): output_data is created here but never passed to the
# PythonScriptStep below, so the step does not write to it.
output_data = PipelineData("output_data", datastore = def_blob_store)
print("output_data object created")
# Single step that runs the script from project_folder on the AML compute.
# NOTE(review): script_name points at a notebook file ("test.ipynb");
# PythonScriptStep presumably expects a .py script -- confirm.
step = PythonScriptStep(name = "test",
                        script_name = "test.ipynb",
                        compute_target = aml_compute, 
                        source_directory = project_folder,
                        allow_reuse = True,
                        runconfig = amlcompute_run_config)
print("Step created.")

###############################
# Build a one-step pipeline, validate it, submit it to the experiment and
# block until the run finishes (without streaming its output).
steps = [step]
print("Step lists created")
pipeline = Pipeline(workspace = ws, steps = steps)
print ("Pipeline is built")
pipeline.validate()
print("Pipeline validation complete")
pipeline_run = experiment.submit(pipeline)
print("Pipeline is submitted for execution")
pipeline_run.wait_for_completion(show_output = False)
print("Pipeline run completed")

###############################
# Download everything under the 'outputs' prefix from the default datastore
# into the current directory, then attach outputs/output.csv to the pipeline run.
def_data_store.download(target_path = '.',
                        prefix = 'outputs',
                        show_progress = True,
                        overwrite = True)
model_fname = 'output.csv'
model_path = os.path.join("outputs", model_fname)
# This call raises the FileNotFoundError quoted in the question: the local
# file outputs/output.csv does not exist after the download above.
pipeline_run.upload_file(name = model_path, path_or_stream = model_path)
print('Uploaded the model {} to experiment {}'.format(model_fname, pipeline_run.experiment.name))

這給了我以下錯誤：

Pipeline run completed
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-22-a8a523969bb3> in <module>
    111 
    112 # Upload the model file explicitly into artifacts (for CI/CD)
--> 113 pipeline_run.upload_file(name = model_path, path_or_stream = model_path)
    114 print('Uploaded the model {} to experiment {}'.format(model_fname, pipeline_run.experiment.name))
    115 

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/core/run.py in wrapped(self, *args, **kwargs)
     47                                      "therefore, the {} cannot upload files, or log file backed metrics.".format(
     48                                          self, self.__class__.__name__))
---> 49         return func(self, *args, **kwargs)
     50     return wrapped
     51 

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/core/run.py in upload_file(self, name, path_or_stream)
   1749         :rtype: azure.storage.blob.models.ResourceProperties
   1750         """
-> 1751         return self._client.artifacts.upload_artifact(path_or_stream, RUN_ORIGIN, self._container, name)
   1752 
   1753     @_check_for_data_container_id

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/_restclient/artifacts_client.py in upload_artifact(self, artifact, *args, **kwargs)
    108         if isinstance(artifact, str):
    109             self._logger.debug("Uploading path artifact")
--> 110             return self.upload_artifact_from_path(artifact, *args, **kwargs)
    111         elif isinstance(artifact, IOBase):
    112             self._logger.debug("Uploading io artifact")

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/_restclient/artifacts_client.py in upload_artifact_from_path(self, path, *args, **kwargs)
    100         path = os.path.normpath(path)
    101         path = os.path.abspath(path)
--> 102         with open(path, "rb") as stream:
    103             return self.upload_artifact_from_stream(stream, *args, **kwargs)
    104 

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/azmnt/code/Users/aniello.spiezia/outputs/output.csv'

你知道可能是什麼問題嗎？ 特別是，我想把名為“output.csv”的輸出文件保存到某個地方。

Answer 1

執行此操作的最佳方法在一定程度上取決於您希望在運行完成后如何處理 output.csv 文件。 但是，通常您可以將 csv 寫入 ./outputs 文件夾：

# azureml-core of version 1.0.72 or higher is required
# Step script: loads two registered datasets, concatenates them and writes the
# result to ./outputs/output.csv. AzureML uploads the contents of ./outputs to
# the run history when the run completes, so no datastore.upload() is needed.
import os

from azureml.core import Workspace, Dataset, Datastore
import pandas as pd
import numpy as np
import datetime
import math

# Connect to the workspace and load the two registered datasets by name
subscription_id = 'myid'
resource_group = 'myrg'
workspace_name = 'mywn'
workspace = Workspace(subscription_id, resource_group, workspace_name)
dataset_zre = Dataset.get_by_name(workspace, name='file1')
dataset_SLA = Dataset.get_by_name(workspace, name='file2')
df_zre = dataset_zre.to_pandas_dataframe()
df_SLA = dataset_SLA.to_pandas_dataframe()
result = pd.concat([df_SLA,df_zre], sort=True)

# Create the relative outputs folder if needed (no error when it already
# exists) and write the CSV there, relative to the run's working directory.
os.makedirs('outputs', exist_ok=True)
result.to_csv('outputs/output.csv', index=False)

print("\nFinished!")
#End of the file

運行完成後，AzureML 會將 ./outputs 目錄的內容上傳到運行歷史記錄，因此不需要 datastore.upload()。

之後，您可以在 http://ml.azure.com 中導航到該運行並看到該文件，就像下面我的 model.pt 文件一樣：

有關 ./outputs 和 ./logs 文件夾的一些信息，請參見此處： https://docs.microsoft.com/en-us/azure/machine-learning/how-to-save-write-experiment-files#where-to-write-files

如果您確實想在運行后創建另一個數據集，請在此處查看此帖子： Azure 機器學習服務 - 數據集 API 問題

Answer 2

在上面 Daniel 的示例中，您需要下載運行的輸出，而不是 pipeline.ipynb 代碼中的數據存儲。 您可以調用 pipeline_run.download('outputs/output.csv', '.')，而不是調用 def_data_store.download()。

另一種選擇是使用 PipelineData 輸出數據。 PipelineData 表示管道步驟的一個命名輸出，如果您想通過輸入和輸出將多個步驟連接在一起，它會非常有用。 使用 PipelineData 時，您需要在聲明步驟時將 PipelineData 對象傳遞給 PythonScriptStep（作為 arguments=[] 和 outputs=[] 的一部分），然後讓腳本從命令行參數中讀取輸出路徑。

此筆記本包含在管道中使用 PipelineData 並下載輸出的示例： https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-data-dependency-steps.ipynb

這篇博文詳細介紹了如何在腳本中處理此問題（解析命令行參數、創建輸出目錄和寫入輸出文件）： https://blog.x5ff.xyz/blog/ai-azureml-python-data-pipelines/

來自管道的 Azure ML 輸出

問題描述

2 個解決方案

解決方案1
3 2020-01-31 03:46:38

解決方案2
0 2020-01-31 17:41:21

來自管道的 Azure ML 輸出

問題描述

2 個解決方案

解決方案1 3 2020-01-31 03:46:38

解決方案2 0 2020-01-31 17:41:21

解決方案1
3 2020-01-31 03:46:38

解決方案2
0 2020-01-31 17:41:21