AWS SageMaker On-Demand Training
This simple example trains a classification model in SageMaker for 10 epochs.
Set up a minimal Python environment
virtualenv --python=/usr/bin/python3.6 .venv
source .venv/bin/activate
pip install sagemaker==2.24.5
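A quick, optional check from Python that the install succeeded:
~~~
import sagemaker
print(sagemaker.__version__)  # expect 2.24.5, matching the pip install above
~~~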
Setup
Download the latest Docker image mbari/kclassify
docker pull mbari/kclassify
Tag the image with your ECR repository and push it (the account ID below is a placeholder; use your own)
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 123456789012.dkr.ecr.us-west-2.amazonaws.com
docker tag mbari/kclassify 123456789012.dkr.ecr.us-west-2.amazonaws.com/kclassify
docker push 123456789012.dkr.ecr.us-west-2.amazonaws.com/kclassify
Run
Alert
Before starting, check that the image is available in your account, e.g.
$ aws ecr describe-repositories --query "repositories[].repositoryName" --output text --region us-west-2
In this example the image kclassify:1.0.1 is used.
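The same check can be done from Python with boto3; a minimal sketch (matching on the name kclassify is an assumption, so adjust it to however the image was named when pushed):
~~~
import boto3

# List ECR repositories in the account and look for the kclassify image.
ecr = boto3.client('ecr', region_name='us-west-2')
repos = [r['repositoryName'] for r in ecr.describe_repositories()['repositories']]
print('kclassify available:', any('kclassify' in name for name in repos))
~~~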
The example below stores the required training data, and the artifacts generated during training, in the following layout:
~~~
└── your bucket/your prefix
    ├── training/
    │   ├── train_stats.json (optional)
    │   ├── train.tar.gz
    │   └── val.tar.gz
    └── checkpoint/ (optional)
        ├── 0f110283-1d0d-41ed-a336-b997bfec0658/
        ├── 1c264240-62b4-4342-9f31-80b6a5d69b14/
        └── ...
~~~
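Once the data is uploaded (see the upload step in the script below), the layout can be confirmed with a short boto3 listing; the bucket and prefix here are placeholders for the values defined in the script:
~~~
import boto3

# Print every object stored under the prefix to confirm the layout above.
s3 = boto3.client('s3')
resp = s3.list_objects_v2(Bucket='your-bucket', Prefix='your-prefix/')
for obj in resp.get('Contents', []):
    print(obj['Key'])
~~~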
Available options for hyperparameters can be found here.
import boto3
import botocore
import datetime as dt
import getpass
import json
import uuid
import sagemaker
from sagemaker.estimator import Estimator
from pathlib import Path
#################################################################################################
# Setup default locations, model parameters and needed globals
#################################################################################################
# For s3 operations
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')
# client for sagemaker operations
sagemaker_session = sagemaker.Session()
# This is set to the IAM role in SageMaker configured for the VAA project. If you are running this outside of a SageMaker notebook, you must set the role yourself.
role = 'arn:aws:iam::872338704006:role/service-role/AmazonSageMaker-ExecutionRole-20201012T164265'
# Region to run this in
region = 'us-west-2'
# ECR URI of the training Docker image
image_uri = '872338704006.dkr.ecr.us-west-2.amazonaws.com/kclassify-v1.0.1'
# The root path your training data is in locally
training_path = Path.cwd() / 'data'
# The root bucket to store your training data and models in.
bucket = '902005-videolab-test-sagemaker'
# This can be anything you want - just a placeholder to separate this data from other training jobs in the bucket
prefix = 'test512x512'
# Training data location
training_channel = prefix + '/training'
s3_train_data = f's3://{bucket}/{training_channel}'
# Location to store checkpoints between jobs; optional - use this to resume a later run from a previous job's checkpoints
checkpoint_s3_bucket = f's3://{bucket}/{prefix}/checkpoint/{uuid.uuid4()}'
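# To resume from an earlier run, point this at that run's existing checkpoint
# prefix instead of generating a new UUID, e.g. (hypothetical path):
# checkpoint_s3_bucket = f's3://{bucket}/{prefix}/checkpoint/0f110283-1d0d-41ed-a336-b997bfec0658'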
print(f'Checkpoint location: {checkpoint_s3_bucket}')
print(f'Training data in: {s3_train_data}')
#################################################################################################
# Configure tags. These are optional, but useful for later cost accounting and clean-up.
#################################################################################################
user = getpass.getuser() # this will grab the system user
deletion_date = (dt.datetime.utcnow() + dt.timedelta(days=90)).strftime('%Y%m%dT%H%M%SZ')
tag_dict = [{'Key': 'mbari:project-number', 'Value': '902005'},
{'Key': 'mbari:owner', 'Value': user},
{'Key': 'mbari:description', 'Value': 'test kclassify training'},
{'Key': 'mbari:customer-project', 'Value': '902005'},
{'Key': 'mbari:stage', 'Value': 'test'},
{'Key': 'mbari:application', 'Value': 'detection'},
{'Key': 'mbari:deletion-date', 'Value': deletion_date},
{'Key': 'mbari:created-by', 'Value': user}]
#################################################################################################
# Create and tag the bucket (only need to do this once)
#################################################################################################
try:
    response = s3_client.create_bucket(Bucket=bucket, CreateBucketConfiguration={'LocationConstraint': region})
    print(response)
# latest exceptions https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
except botocore.exceptions.ClientError as error:
    if 'BucketAlreadyOwnedByYou' not in str(error):
        raise error

# Apply the tags to the bucket
bucket_tagging = s3_resource.BucketTagging(bucket)
bucket_tagging.put(Tagging={'TagSet': tag_dict})
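# Optional sanity check (assumes s3:GetBucketTagging permission): read the tags
# back to confirm they were applied.
print(s3_client.get_bucket_tagging(Bucket=bucket)['TagSet'])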
#################################################################################################
# Upload the data (only need to do this once, unless the data has changed)
#################################################################################################
print(f'Uploading data to {bucket}...')
for file in training_path.glob('*.*'):
    sagemaker_session.upload_data(path=f'{training_path}/{file.name}', bucket=bucket, key_prefix=training_channel)
print('Done')
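# Optional: confirm the files landed under the training prefix.
resp = s3_client.list_objects_v2(Bucket=bucket, Prefix=training_channel)
for obj in resp.get('Contents', []):
    print(obj['Key'], obj['Size'])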
#################################################################################################
# Define the metrics to log
#################################################################################################
metric_definitions = [{'Name': 'validation_accuracy', 'Regex': 'validation_accuracy = ([0-9.]+)'},
                      {'Name': 'validation_loss', 'Regex': 'validation_loss = ([0-9.]+)'},
                      {'Name': 'best_val_categorical_accuracy', 'Regex': 'best_val_categorical_accuracy = ([0-9.]+)'}]
print(metric_definitions)
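# A quick sketch of how SageMaker applies these regexes to the job's log
# stream; the sample log line below is hypothetical.
import re
sample_log_line = 'validation_accuracy = 0.8734'
for md in metric_definitions:
    match = re.search(md['Regex'], sample_log_line)
    if match:
        print(md['Name'], '->', match.group(1))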
#################################################################################################
# Run training job
#################################################################################################
estimator = Estimator(base_job_name='bluewhale-a-efnetb0',
                      role=role,
                      tags=tag_dict,
                      image_uri=image_uri,
                      volume_size=10,
                      enable_sagemaker_metrics=True,
                      instance_count=1,
                      instance_type='ml.p2.xlarge',
                      sagemaker_session=sagemaker_session,
                      input_mode='File',
                      checkpoint_s3_uri=checkpoint_s3_bucket,  # optional; wires up the checkpoint location defined above
                      metric_definitions=metric_definitions,
                      hyperparameters={
                          'epochs': 10,
                          'early_stop': True,
                          'horizontal_flip': True,
                          'vertical_flip': False,
                          'batch_size': 64,
                          'optimizer': 'adam',
                          'base_model': 'efficientnetB0',
                          'train_stats': '/opt/ml/input/data/training/train_stats.json',
                          'train': '/opt/ml/input/data/training/train.tar.gz',  # these must match the training files in the bucket and must be in /opt/ml/input/data/training, as specified by the bucket prefix
                          'eval': '/opt/ml/input/data/training/val.tar.gz',
                          'saved-model-dir': '/opt/ml/model/1'  # must be under /opt/ml/model; /opt/ml/model/1 saves version 1 of the model
                      })
train_data = sagemaker.inputs.TrainingInput(f'{s3_train_data}', distribution='FullyReplicated', content_type='text/plain', s3_data_type='S3Prefix')
data = {'training': train_data}
# Finally, run!
estimator.fit(inputs=data)
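# When fit() returns, the packaged model artifact location is available from
# the estimator (assumes the job completed successfully).
print(f'Model artifact: {estimator.model_data}')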