[英]I've had trouble training a model in AWS SageMaker, everything is fine until the model needs to be saved
我在 AWS SageMaker 中訓練 model 時遇到了麻煩,在需要保存 model 之前一切都很好。 我嘗試使用 500MB 數據集,一切正常,但是當 .csv 文件占用 10GB 時,訓練作業失敗。 接下來我離開我的訓練 python 文件和錯誤 output,用於訓練的機器是 ml.m5.2xlarge,train_volume_size = 100。
使用.py文件在 SageMaker 中訓練 model,output 為 10GB
import argparse
import pandas as pd
import os
import sys
from os.path import join
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import numpy as np
import logging
import boto3
import time
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))
if 'SAGEMAKER_METRICS_DIRECTORY' in os.environ:
log_file_handler = logging.FileHandler(join(os.environ['SAGEMAKER_METRICS_DIRECTORY'], "metrics.json"))
log_file_handler.setFormatter(
"{'time':'%(asctime)s', 'name': '%(name)s', \
'level': '%(levelname)s', 'message': '%(message)s'}"
)
logger.addHandler(log_file_handler)
os.system('pip install joblib')
import joblib
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# Adicion de hyperparametros
# Solamente se añade el parametro lambda de regularizacion
parser.add_argument('--regularization_lambda',type=float, default=0.0)
# Argumentos propios de sagemaker
parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
args = parser.parse_args()
input_files = [ os.path.join(args.train, file) for file in os.listdir(args.train) ]
if len(input_files) == 0:
raise ValueError(('There are no files in {}.\n' +
'This usually indicates that the channel ({}) was incorrectly specified,\n' +
'the data specification in S3 was incorrectly specified or the role specified\n' +
'does not have permission to access the data.').format(args.train, "train"))
raw_data = [pd.read_csv(file,header=None,engine="python") for file in input_files]
train_data = pd.concat(raw_data)
# Definicion del modelo
model = GaussianNB()
matrix = train_data.values
for submatrix in np.split(matrix,np.arange(100,12100,100),axis=0):
# Generacion de los datos de entrenemiento asumiendo que
# las etiquetas estan en la primera columna
train_y = submatrix[:,0]
train_x = submatrix[:,1:]
model = model.partial_fit(train_x,train_y,classes=np.unique(train_y))
print('Accuracy: ', model.score(train_x, train_y))
logger.info('Train accuracy: {:.6f};'.format(model.score(train_x, train_y)))
# Mustra de los coeficientes y guradarlos
joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
def model_fn(model_dir):
# Se retorna el modelo entrenado
model = joblib.load(os.path.join(model_dir, "model.joblib"))
return model
當完成 output 是下一個錯誤
2020-07-20 09:49:52 Starting - Starting the training job...
2020-07-20 09:49:54 Starting - Launching requested ML instances......
2020-07-20 09:50:58 Starting - Preparing the instances for training...
2020-07-20 09:51:39 Downloading - Downloading input data...............
2020-07-20 09:54:22 Training - Training image download completed. Training in progress..2020-07-20 09:54:24,234 sagemaker-containers INFO Imported framework sagemaker_sklearn_container.training
2020-07-20 09:54:24,236 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)
2020-07-20 09:54:24,246 sagemaker_sklearn_container.training INFO Invoking user training script.
2020-07-20 09:54:24,803 sagemaker-containers INFO Module eeg-NB-model does not provide a setup.py.
Generating setup.py
2020-07-20 09:54:24,803 sagemaker-containers INFO Generating setup.cfg
2020-07-20 09:54:24,803 sagemaker-containers INFO Generating MANIFEST.in
2020-07-20 09:54:24,803 sagemaker-containers INFO Installing module with the following command:
/miniconda3/bin/python -m pip install .
Processing /opt/ml/code
Building wheels for collected packages: eeg-NB-model
Building wheel for eeg-NB-model (setup.py): started
Building wheel for eeg-NB-model (setup.py): finished with status 'done'
Created wheel for eeg-NB-model: filename=eeg_NB_model-1.0.0-py2.py3-none-any.whl size=7074 sha256=2d6213105e4f7f707f68278b1291d2940b8de2c319f7084b322b2d4197402c33
Stored in directory: /tmp/pip-ephem-wheel-cache-8kr3fxjv/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3
Successfully built eeg-NB-model
Installing collected packages: eeg-NB-model
Successfully installed eeg-NB-model-1.0.0
2020-07-20 09:54:26,753 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)
2020-07-20 09:54:26,763 sagemaker-containers INFO Invoking user script
Training Env:
{
"additional_framework_parameters": {},
"channel_input_dirs": {
"train": "/opt/ml/input/data/train"
},
"current_host": "algo-1",
"framework_module": "sagemaker_sklearn_container.training:main",
"hosts": [
"algo-1"
],
"hyperparameters": {
"regularization_lambda": 0.0
},
"input_config_dir": "/opt/ml/input/config",
"input_data_config": {
"train": {
"TrainingInputMode": "File",
"S3DistributionType": "FullyReplicated",
"RecordWrapperType": "None"
}
},
"input_dir": "/opt/ml/input",
"is_master": true,
"job_name": "sagemaker-scikit-learn-2020-07-20-09-49-52-390",
"log_level": 20,
"master_hostname": "algo-1",
"model_dir": "/opt/ml/model",
"module_dir": "s3://sagemaker-eu-west-1-798663412819/sagemaker-scikit-learn-2020-07-20-09-49-52-390/source/sourcedir.tar.gz",
"module_name": "eeg-NB-model",
"network_interface_name": "eth0",
"num_cpus": 8,
"num_gpus": 0,
"output_data_dir": "/opt/ml/output/data",
"output_dir": "/opt/ml/output",
"output_intermediate_dir": "/opt/ml/output/intermediate",
"resource_config": {
"current_host": "algo-1",
"hosts": [
"algo-1"
],
"network_interface_name": "eth0"
},
"user_entry_point": "eeg-NB-model.py"
}
Environment variables:
SM_HOSTS=["algo-1"]
SM_NETWORK_INTERFACE_NAME=eth0
SM_HPS={"regularization_lambda":0.0}
SM_USER_ENTRY_POINT=eeg-NB-model.py
SM_FRAMEWORK_PARAMS={}
SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"}
SM_INPUT_DATA_CONFIG={"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}
SM_OUTPUT_DATA_DIR=/opt/ml/output/data
SM_CHANNELS=["train"]
SM_CURRENT_HOST=algo-1
SM_MODULE_NAME=eeg-NB-model
SM_LOG_LEVEL=20
SM_FRAMEWORK_MODULE=sagemaker_sklearn_container.training:main
SM_INPUT_DIR=/opt/ml/input
SM_INPUT_CONFIG_DIR=/opt/ml/input/config
SM_OUTPUT_DIR=/opt/ml/output
SM_NUM_CPUS=8
SM_NUM_GPUS=0
SM_MODEL_DIR=/opt/ml/model
SM_MODULE_DIR=s3://sagemaker-eu-west-1-798663412819/sagemaker-scikit-learn-2020-07-20-09-49-52-390/source/sourcedir.tar.gz
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"train":"/opt/ml/input/data/train"},"current_host":"algo-1","framework_module":"sagemaker_sklearn_container.training:main","hosts":["algo-1"],"hyperparameters":{"regularization_lambda":0.0},"input_config_dir":"/opt/ml/input/config","input_data_config":{"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":true,"job_name":"sagemaker-scikit-learn-2020-07-20-09-49-52-390","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-eu-west-1-798663412819/sagemaker-scikit-learn-2020-07-20-09-49-52-390/source/sourcedir.tar.gz","module_name":"eeg-NB-model","network_interface_name":"eth0","num_cpus":8,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"eeg-NB-model.py"}
SM_USER_ARGS=["--regularization_lambda","0.0"]
SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
SM_CHANNEL_TRAIN=/opt/ml/input/data/train
SM_HP_REGULARIZATION_LAMBDA=0.0
PYTHONPATH=/miniconda3/bin:/miniconda3/lib/python37.zip:/miniconda3/lib/python3.7:/miniconda3/lib/python3.7/lib-dynload:/miniconda3/lib/python3.7/site-packages
Invoking script with the following command:
/miniconda3/bin/python -m eeg-NB-model --regularization_lambda 0.0
/miniconda3/lib/python3.7/site-packages/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py:47: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
import imp
Collecting joblib
Downloading https://files.pythonhosted.org/packages/51/dd/0e015051b4a27ec5a58b02ab774059f3289a94b0906f880a3f9507e74f38/joblib-0.16.0-py3-none-any.whl (300kB)
Installing collected packages: joblib
Successfully installed joblib-0.16.0
2020-07-20 09:58:21 Uploading - Uploading generated training model
2020-07-20 09:58:21 Failed - Training job failed
2020-07-20 09:58:12,544 sagemaker-containers ERROR ExecuteUserScriptError:
Command "/miniconda3/bin/python -m eeg-NB-model --regularization_lambda 0.0"
---------------------------------------------------------------------------
UnexpectedStatusException Traceback (most recent call last)
<ipython-input-7-267e445b3bf0> in <module>
28 NB_training_job_name = "Naive-Bayes-training-job-{}".format(int(time.time()))
29
---> 30 estimator.fit({'train': train_input},wait=True)
/opt/conda/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
463 self.jobs.append(self.latest_training_job)
464 if wait:
--> 465 self.latest_training_job.wait(logs=logs)
466
467 def _compilation_job_name(self):
/opt/conda/lib/python3.6/site-packages/sagemaker/estimator.py in wait(self, logs)
1056 # If logs are requested, call logs_for_jobs.
1057 if logs != "None":
-> 1058 self.sagemaker_session.logs_for_job(self.job_name, wait=True, log_type=logs)
1059 else:
1060 self.sagemaker_session.wait_for_job(self.job_name)
/opt/conda/lib/python3.6/site-packages/sagemaker/session.py in logs_for_job(self, job_name, wait, poll, log_type)
3019
3020 if wait:
-> 3021 self._check_job_status(job_name, description, "TrainingJobStatus")
3022 if dot:
3023 print()
/opt/conda/lib/python3.6/site-packages/sagemaker/session.py in _check_job_status(self, job, desc, status_key_name)
2613 ),
2614 allowed_statuses=["Completed", "Stopped"],
-> 2615 actual_status=status,
2616 )
2617
UnexpectedStatusException: Error for Training job sagemaker-scikit-learn-2020-07-20-09-49-52-390: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/miniconda3/bin/python -m eeg-NB-model --regularization_lambda 0.0"
但是,看起來您正在將所有數據加載到 memory 中,如下所示:
raw_data = [pd.read_csv(file,header=None,engine="python") for file in input_files]
train_data = pd.concat(raw_data)
您使用的 model 類型ml.m5.2xlarge
具有32 GiB 的 memory 。 以這種方式將所有數據加載到 memory 可能會導致內存不足異常或超時。 查看 SageMaker / Cloudwatch 日志以嘗試獲取失敗原因。 不幸的是,SageMaker 日志僅顯示ExecuteUserScriptError
並不能告訴您太多信息,但在其他情況下,此錯誤代碼沒有詳細信息是由於資源錯誤造成的。
對此進行測試的一種方法是將 sagemaker 實例的大小增加到具有更大 memory 的實例。
或者,您可以避免一次將所有訓練數據加載到 memory 中。 看起來您的輸入 CSV 數據已經拆分為文件。 您是否考慮過在所有這些文件上編寫一個循環來逐個訓練它們? 這樣您就不必一次將所有功能存儲在 memory 中。
for file in input_files:
raw_data_block = pd.read_csv(file,header=None,engine="python")
# training code for raw_data_block here.
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.