Automatic hyperparameter tuning in AWS SageMaker not running

Problem description

I am training my model with SageMaker. To get better results I am trying to run automatic hyperparameter tuning. Training without tuning runs fine and gives the desired results, but as soon as I try to run it with tuning, the training jobs fail with an error similar to the following (num_filters and learning_rate vary from job to job):

AlgorithmError: ExecuteUserScriptError: Command "/usr/bin/python3 script_unet.py --batch_size 54 --learning_rate 0.0002596573898074083
--model_dir s3://sagemaker-us-east-2-6713267672/tensorflow-training-2020-07-04-10-02-56-198/model/tensorflow-training-200704-1002-002-b7291d39/model --num_filters 46"
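For context, each tuning job launches the training script with the sampled hyperparameters appended as command-line flags, which is exactly the command quoted in the error above. A minimal sketch of how argparse consumes such a command (flag names are taken from the script below; the sample values are the ones from the failing job):

import argparse

# Minimal reproduction of how the tuner-generated command line is parsed.
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int)
parser.add_argument('--learning_rate', type=float)
parser.add_argument('--model_dir', type=str)
parser.add_argument('--num_filters', type=int)

args, _ = parser.parse_known_args([
    '--batch_size', '54',
    '--learning_rate', '0.0002596573898074083',
    '--num_filters', '46',
])
print(args)  # Namespace(batch_size=54, learning_rate=0.000259..., model_dir=None, num_filters=46)

These flags parse cleanly on their own, which suggests the failure happens later in the script rather than at argument parsing.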

I have already tried many other batch sizes to make sure it is not a memory problem, and it always gives the same error, so I assume it is not. I need the model as an .h5 file so I can use it outside of SageMaker, which is why I added the lines that save it to a bucket named models-pfe.

The model script I am using is the following:

# Dependencies
import argparse, os
import numpy as np

import tensorflow as tf
from keras import backend as K
from keras.models import Model, load_model
from keras.layers import Input
from keras.layers.core import Dropout, Lambda
from keras.layers.convolutional import Conv2D, Conv2DTranspose
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.optimizers import Adam
from keras.utils import multi_gpu_model
import boto3
from botocore.exceptions import NoCredentialsError

print("All the dependencies imported")


if __name__ == '__main__':
    # Hyperparameters arrive as command-line arguments; the SM_CHANNEL_*
    # environment variables are set by SageMaker to the local paths of the
    # downloaded input channels.
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--epochs', type=int, default=60)
    parser.add_argument('--num_filters', type=int, default=32)
    parser.add_argument('--learning_rate', type=float, default=0.0001)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--model_dir', type=str, default='s3://model-pfe')
    parser.add_argument('--training', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--testing', type=str, default=os.environ['SM_CHANNEL_TESTING'])
    parser.add_argument('--access_key', type=str)
    parser.add_argument('--secret_key', type=str)



    args, _ = parser.parse_known_args()

    epochs       = args.epochs
    num_filters  = args.num_filters
    lr           = args.learning_rate
    batch_size   = args.batch_size
    model_dir    = args.model_dir
    training_dir = args.training
    testing_dir  = args.testing
    access_key  = args.access_key
    secret_key  = args.secret_key



    # Load the training and test sets from the .npz files in the input channels
    train_data = np.load(os.path.join(training_dir, 'training.npz'))
    X_train, Y_train = train_data['image'], train_data['label']
    test_data = np.load(os.path.join(testing_dir, 'testing.npz'))
    X_test, Y_test = test_data['image'], test_data['label']

    # input image dimensions
    img_rows, img_cols = 512,512

    # Tensorflow needs image channels last, e.g. (batch size, width, height, channels)
    K.set_image_data_format('channels_last')
    print(K.image_data_format())



    print('X_train shape:', X_train.shape)
    print(X_train.shape[0], 'train samples')
    print(X_test.shape[0], 'test samples')

    # Normalize pixel values
    X_train   = X_train.astype('float32')
    X_test    = X_test.astype('float32')
    X_train  /= 255
    X_test   /= 255


    # U-Net model
    inputs = Input((512, 512, 3))
    c1 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (inputs)
    c1 = Dropout(0.1) (c1)
    c1 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c1)
    p1 = MaxPooling2D((2, 2)) (c1)

    c2 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p1)
    c2 = Dropout(0.1) (c2)
    c2 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c2)
    p2 = MaxPooling2D((2, 2)) (c2)

    c3 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p2)
    c3 = Dropout(0.2) (c3)
    c3 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c3)
    p3 = MaxPooling2D((2, 2)) (c3)

    c4 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p3)
    c4 = Dropout(0.2) (c4)
    c4 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c4)
    p4 = MaxPooling2D(pool_size=(2, 2)) (c4)

    c5 = Conv2D(num_filters*16, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p4)
    c5 = Dropout(0.3) (c5)
    c5 = Conv2D(num_filters*16, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c5)

    u6 = Conv2DTranspose(num_filters*8, (2, 2), strides=(2, 2), padding='same') (c5)
    u6 = concatenate([u6, c4])
    c6 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u6)
    c6 = Dropout(0.2) (c6)
    c6 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c6)

    u7 = Conv2DTranspose(num_filters*4, (2, 2), strides=(2, 2), padding='same') (c6)
    u7 = concatenate([u7, c3])
    c7 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u7)
    c7 = Dropout(0.2) (c7)
    c7 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c7)

    u8 = Conv2DTranspose(num_filters*2, (2, 2), strides=(2, 2), padding='same') (c7)
    u8 = concatenate([u8, c2])
    c8 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u8)
    c8 = Dropout(0.1) (c8)
    c8 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c8)

    u9 = Conv2DTranspose(num_filters, (2, 2), strides=(2, 2), padding='same') (c8)
    u9 = concatenate([u9, c1], axis=3)
    c9 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u9)
    c9 = Dropout(0.1) (c9)
    c9 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c9)

    outputs = Conv2D(1, (1, 1), activation='sigmoid') (c9)
    model = Model(inputs=[inputs], outputs=[outputs])
    print(model.summary())

    # Use GPUs (for ml.p2.8xlarge = 8 GPUs)
    model = multi_gpu_model(model, gpus=8)

    model.compile(optimizer=Adam(lr=lr), loss='binary_crossentropy', metrics=['accuracy'])
    
    #Fit model
    results = model.fit(X_train, Y_train,
                        validation_data=(X_test, Y_test),
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        shuffle=True)

    
    # Validation evaluation
    score = model.evaluate(X_test, Y_test)
    print('Validation loss    :', score[0])
    print('Validation accuracy:', score[1])

    s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
    
    def upload_to_aws(local_file, bucket, s3_file):
        try:
            s3.upload_file(local_file, bucket, s3_file)
            print("Upload Successful")
            return True
        except FileNotFoundError:
            print("The file was not found")
            return False
        except NoCredentialsError:
            print("Credentials not available")
            return False
    model.save('model.h5')
    upload_to_aws('model.h5','models-pfe',"model.h5")

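As a side note on the .h5 upload above: in standard SageMaker script-mode containers, the SM_MODEL_DIR environment variable points at a local directory (/opt/ml/model) whose contents are packaged and uploaded to S3 automatically when the job finishes, which avoids passing access keys as hyperparameters. A minimal sketch, reusing model from the script above (the env-var fallback is an assumption for running outside SageMaker):

import os

# SM_MODEL_DIR is set by SageMaker to /opt/ml/model; everything written
# there is packaged as model.tar.gz and uploaded to S3 after training.
local_model_dir = os.environ.get('SM_MODEL_DIR', '/opt/ml/model')
model.save(os.path.join(local_model_dir, 'model.h5'))
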
To run this script with automatic hyperparameter tuning, I use the following script:

import sagemaker
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

#My data location in s3
training_input_path="s3://sagemaker-us-east-2-6713267672/pfe-unet/training/training.npz"
validation_input_path="s3://sagemaker-us-east-2-6713267672/pfe-unet/validation/testing.npz"


from sagemaker.tensorflow import TensorFlow

tf_estimator = TensorFlow(entry_point='script_unet.py', 
                          role=role,
                          train_instance_count=1, 
                          train_instance_type='ml.p2.8xlarge',
                          framework_version='1.12', 
                          py_version='py3',
                          script_mode=True,
                          hyperparameters={
                              'epochs': 60,
                              'batch_size': 32, 
                              'access_key'   : '',
                              'secret_key'   : ''}
                         )
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

hyperparameter_ranges = {
    'num_filters'  : IntegerParameter(32,64),
    'learning_rate': ContinuousParameter(0.0001, 0.005)}

objective_metric_name = 'loss'
objective_type = 'Minimize'
metric_definitions = [{'Name': 'loss','Regex': 'loss = ([0-9\\.]+)'}]

tuner = HyperparameterTuner(tf_estimator,
                            objective_metric_name,
                            hyperparameter_ranges,
                            metric_definitions,
                            max_jobs=6,
                            max_parallel_jobs=1,
                            objective_type=objective_type,
                            early_stopping_type='Auto')

tuner.fit({'training': training_input_path, 'validation': validation_input_path})
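
One thing worth sanity-checking with HyperparameterTuner: the objective metric is scraped from the training job's logs using the Regex in metric_definitions, so the pattern has to match what Keras actually prints. A quick check (the sample log line is an assumption based on typical Keras 2.x verbose=1 output):

import re

# Sample progress line in the style Keras prints with verbose=1.
sample = '512/512 [==============================] - 10s - loss: 0.2314 - acc: 0.9121'

print(re.search(r'loss = ([0-9\.]+)', sample))          # None: 'loss = ' never appears
print(re.search(r'loss: ([0-9\.]+)', sample).group(1))  # '0.2314'

If the regex never matches, the tuner has no objective value to optimize, so it is worth verifying the pattern against the job's real CloudWatch logs.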

For security reasons I have changed the bucket names, the secret key and the access key.

Tags: amazon-web-services, amazon-s3, amazon-ec2, deep-learning, amazon-sagemaker
