amazon-web-services - Sagemaker aws 中的自动超参数调整无法运行
问题描述
我正在使用 SageMaker 训练我的模型。为了获得更好的结果,我正在尝试运行自动超参数调整。不使用此方法的普通训练运行得很好并给出了所需的结果,但是一旦我尝试通过自动调优运行它,每个训练作业就会给出类似以下的错误(其中 num_filters 和 learning_rate 的取值会随调优作业而变化):
algorithmerror: ExecuteUserScriptError: Command "/usr/bin/python3 script_unet.py --batch_size 54 --learning_rate 0.0002596573898074083
--model_dir s3://sagemaker-us-east-2-6713267672/tensorflow-training-2020-07-04-10-02-56-198/model/tensorflow-training-200704-1002-002-b7291d39/model --num_filters 46"
我已经尝试了许多其他批量大小,以确认这不是内存问题,但它总是给出相同的错误,所以我认为与内存无关。我需要 .h5 格式的模型文件以便在 SageMaker 外部使用,这就是为什么我在脚本末尾将模型保存并上传到一个名为 models-pfe 的存储桶中。
我正在使用的模型脚本如下:
#Dependencies:
import argparse, os
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.models import Model, load_model
from keras.layers import Input
from keras.layers.core import Dropout, Lambda
from keras.layers.convolutional import Conv2D, Conv2DTranspose
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.optimizers import Adam
from keras.utils import multi_gpu_model
import boto3
from botocore.exceptions import NoCredentialsError
print("All the dependencies imported")
if __name__ == '__main__':
    # SageMaker script-mode entry point: hyperparameters arrive as CLI flags,
    # data channel locations arrive via SM_CHANNEL_* environment variables.
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=60)
    parser.add_argument('--num_filters', type=int, default=32)
    parser.add_argument('--learning_rate', type=float, default=0.0001)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--model_dir', type=str, default='s3://model-pfe')
    # .get() instead of [] so the script can still be parsed/imported outside
    # SageMaker, where SM_CHANNEL_* is not defined (os.environ[...] would
    # raise KeyError while the parser is being built).
    parser.add_argument('--training', type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))
    parser.add_argument('--testing', type=str, default=os.environ.get('SM_CHANNEL_TESTING'))
    # SECURITY NOTE(review): credentials passed as hyperparameters are stored
    # and logged in plaintext by SageMaker; prefer granting the training job's
    # IAM role access to the destination bucket instead.
    parser.add_argument('--access_key', type=str)
    parser.add_argument('--secret_key', type=str)
    # parse_known_args tolerates the extra flags SageMaker tuning injects
    # (e.g. _tuning_objective_metric).
    args, _ = parser.parse_known_args()

    epochs = args.epochs
    num_filters = args.num_filters
    lr = args.learning_rate
    batch_size = args.batch_size
    model_dir = args.model_dir
    training_dir = args.training
    testing_dir = args.testing
    access_key = args.access_key
    secret_key = args.secret_key

    # Open each .npz archive once and read both arrays from it
    # (the original re-opened every archive for each array).
    with np.load(os.path.join(training_dir, 'training.npz')) as train_npz:
        X_train = train_npz['image']
        Y_train = train_npz['label']
    with np.load(os.path.join(testing_dir, 'testing.npz')) as test_npz:
        X_test = test_npz['image']
        Y_test = test_npz['label']

    # input image dimensions
    img_rows, img_cols = 512, 512

    # TensorFlow expects channels-last tensors: (batch, height, width, channels).
    K.set_image_data_format('channels_last')
    print(K.image_data_format())
    print('X_train shape:', X_train.shape)
    print(X_train.shape[0], 'train samples')
    print(X_test.shape[0], 'test samples')

    # Normalize pixel values to [0, 1].
    X_train = X_train.astype('float32') / 255
    X_test = X_test.astype('float32') / 255

    def _conv_block(x, filters, drop_rate):
        # Two 3x3 ReLU convolutions with dropout in between (standard U-Net unit).
        x = Conv2D(filters, (3, 3), activation='relu',
                   kernel_initializer='he_normal', padding='same')(x)
        x = Dropout(drop_rate)(x)
        return Conv2D(filters, (3, 3), activation='relu',
                      kernel_initializer='he_normal', padding='same')(x)

    # U-Net: 4-level encoder / decoder with skip connections.
    inputs = Input((512, 512, 3))
    c1 = _conv_block(inputs, num_filters, 0.1)
    p1 = MaxPooling2D((2, 2))(c1)
    c2 = _conv_block(p1, num_filters * 2, 0.1)
    p2 = MaxPooling2D((2, 2))(c2)
    c3 = _conv_block(p2, num_filters * 4, 0.2)
    p3 = MaxPooling2D((2, 2))(c3)
    c4 = _conv_block(p3, num_filters * 8, 0.2)
    p4 = MaxPooling2D(pool_size=(2, 2))(c4)
    c5 = _conv_block(p4, num_filters * 16, 0.3)

    u6 = Conv2DTranspose(num_filters * 8, (2, 2), strides=(2, 2), padding='same')(c5)
    c6 = _conv_block(concatenate([u6, c4]), num_filters * 8, 0.2)
    u7 = Conv2DTranspose(num_filters * 4, (2, 2), strides=(2, 2), padding='same')(c6)
    c7 = _conv_block(concatenate([u7, c3]), num_filters * 4, 0.2)
    u8 = Conv2DTranspose(num_filters * 2, (2, 2), strides=(2, 2), padding='same')(c7)
    c8 = _conv_block(concatenate([u8, c2]), num_filters * 2, 0.1)
    u9 = Conv2DTranspose(num_filters, (2, 2), strides=(2, 2), padding='same')(c8)
    c9 = _conv_block(concatenate([u9, c1], axis=3), num_filters, 0.1)

    # Single-channel sigmoid output: per-pixel binary segmentation mask.
    outputs = Conv2D(1, (1, 1), activation='sigmoid')(c9)
    model = Model(inputs=[inputs], outputs=[outputs])
    print(model.summary())

    # Use GPUs (for ml.p2.8xlarge = 8 GPUs). Keep the single-GPU template in
    # `model`: per the Keras docs, save the TEMPLATE model, not the wrapper
    # returned by multi_gpu_model, otherwise the saved .h5 cannot be reloaded
    # on a single-GPU/CPU host. The original overwrote `model` and saved the
    # wrapper.
    parallel_model = multi_gpu_model(model, gpus=8)
    parallel_model.compile(optimizer=Adam(lr=lr), loss='binary_crossentropy',
                           metrics=['accuracy'])

    # Fit model (the template model shares weights with the parallel one).
    results = parallel_model.fit(X_train, Y_train,
                                 validation_data=(X_test, Y_test),
                                 batch_size=batch_size,
                                 epochs=epochs,
                                 verbose=1,
                                 shuffle=True)

    # Validation evaluation
    score = parallel_model.evaluate(X_test, Y_test)
    print('Validation loss :', score[0])
    print('Validation accuracy:', score[1])

    s3 = boto3.client('s3', aws_access_key_id=access_key,
                      aws_secret_access_key=secret_key)

    def upload_to_aws(local_file, bucket, s3_file):
        # Best-effort upload of local_file to s3://bucket/s3_file.
        # Returns True on success, False when the file or credentials are missing.
        try:
            s3.upload_file(local_file, bucket, s3_file)
            print("Upload Successful")
            return True
        except FileNotFoundError:
            print("The file was not found")
            return False
        except NoCredentialsError:
            print("Credentials not available")
            return False

    # Save the single-GPU template so the .h5 is loadable anywhere, then push
    # it to the external bucket for use outside SageMaker.
    model.save('model.h5')
    upload_to_aws('model.h5', 'models-pfe', "model.h5")
为了在自动超参数调整中运行上述训练脚本,我使用了以下启动脚本:
# Launcher for SageMaker automatic hyperparameter tuning of script_unet.py.
import sagemaker

sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# My data location in s3
training_input_path = "s3://sagemaker-us-east-2-6713267672/pfe-unet/training/training.npz"
validation_input_path = "s3://sagemaker-us-east-2-6713267672/pfe-unet/validation/testing.npz"

from sagemaker.tensorflow import TensorFlow

# SECURITY NOTE(review): hyperparameters are stored in plaintext in the
# training-job description and CloudWatch logs — do not ship real AWS keys
# this way; grant the job's IAM role bucket access instead.
tf_estimator = TensorFlow(entry_point='script_unet.py',
                          role=role,
                          train_instance_count=1,
                          train_instance_type='ml.p2.8xlarge',
                          framework_version='1.12',
                          py_version='py3',
                          script_mode=True,
                          hyperparameters={
                              'epochs': 60,
                              'batch_size': 32,
                              'access_key': '',
                              'secret_key': ''}
                          )

from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

hyperparameter_ranges = {
    'num_filters': IntegerParameter(32, 64),
    'learning_rate': ContinuousParameter(0.0001, 0.005)}

# Keras progress logs print "... - val_loss: 0.1234 - ...", never "loss = ...",
# so the original regex 'loss = ([0-9\.]+)' could never capture an objective
# metric. Track the validation loss, which is what the tuner should minimize.
objective_metric_name = 'val_loss'
objective_type = 'Minimize'
metric_definitions = [{'Name': 'val_loss', 'Regex': 'val_loss: ([0-9\\.]+)'}]

tuner = HyperparameterTuner(tf_estimator,
                            objective_metric_name,
                            hyperparameter_ranges,
                            metric_definitions,
                            max_jobs=6,
                            max_parallel_jobs=1,
                            objective_type=objective_type,
                            early_stopping_type='Auto')

# BUG FIX: the training script reads os.environ['SM_CHANNEL_TESTING'], and
# SageMaker only sets SM_CHANNEL_<NAME> for the channel names passed here.
# Naming the second channel 'validation' left SM_CHANNEL_TESTING unset, so
# every tuning job crashed with a KeyError (the ExecuteUserScriptError seen
# in the question). The channel must be named 'testing'.
tuner.fit({'training': training_input_path, 'testing': validation_input_path})
出于安全考虑,我更改了存储桶名称、密钥和访问密钥
解决方案
推荐阅读
- algorithm - 在解决约束问题时,有没有比完全排列更好的 Multiple 算法?
- ios - Cordova 提交到应用商店时出现“Apple 将停止接受使用 UIWebView 的应用提交”警告
- php - 如何使用 maatexcel 库为导出到 excel 文件的每一列制作边框?
- java - Elasticsearch 更新文档而不创建新索引
- angular - 想了解@ViewChild() 的工作原理
- r - 创建简单的 grob 布局
- linux - 正在运行的进程上的 greps 管道不输出任何内容
- laravel - 对带有左连接的查询应用过滤
- javascript - Scroll is not working for ie 11 sharepoint 2013
- multithreading - How to stop child threads if keyboard exception occurs in python?