diff --git a/workflows/tf-object-detection-training-tf2/defaults.json b/workflows/tf-object-detection-training-tf2/defaults.json new file mode 100644 index 00000000..d3c3b45d --- /dev/null +++ b/workflows/tf-object-detection-training-tf2/defaults.json @@ -0,0 +1,295 @@ +{ + "frcnn-res50-coco": { + "num_clones": 1, + "batch_size": 1, + "num_steps": 10000, + "num_classes": 90, + "initial_learning_rate": 0.0003, + "warmup_learning_rate": 0.0, + "warmup_steps": 5000, + "momentum_optimizer_value": 0.9, + "min_dimension": 800, + "max_dimension": 1333, + "height_stride": 16, + "width_stride": 16, + "first_stage_regularizer_weight": 0.0, + "first_stage_initializer_stddev": 0.01, + "first_stage_nms_score_threshold": 0.0, + "first_stage_nms_iou_threshold": 0.7, + "first_stage_max_proposals": 300, + "first_stage_localization_loss_weight": 2.0, + "first_stage_objectness_loss_weight": 1.0, + "initial_crop_size": 14, + "maxpool_kernel_size": 2, + "maxpool_stride": 2, + "second_stage_regularizer_weight": 0.0, + "second_stage_initializer_factor": 1.0, + "second_stage_initializer_mode": 2, + "second_stage_use_dropout": false, + "second_stage_dropout_keep_probability": 1.0, + "second_stage_nms_score_threshold": 0.0, + "second_stage_nms_iou_threshold": 0.6, + "second_stage_max_detections_per_class": 100, + "second_stage_max_detections_max_total_detections": 100, + "second_stage_localization_loss_weight": 2.0, + "second_stage_classification_loss_weight": 1.0 + }, + "frcnn-res101-coco": { + "num_clones": 1, + "batch_size": 1, + "num_steps": 10000, + "num_classes": 90, + "initial_learning_rate": 0.0003, + "warmup_learning_rate": 0.0, + "warmup_steps": 5000, + "momentum_optimizer_value": 0.9, + "min_dimension": 800, + "max_dimension": 1333, + "first_stage_features_stride": 16, + "height_stride": 16, + "width_stride": 16, + "first_stage_regularizer_weight": 0.0, + "first_stage_initializer_stddev": 0.01, + "first_stage_nms_score_threshold": 0.0, + "first_stage_nms_iou_threshold": 0.7, + 
"first_stage_max_proposals": 20, + "first_stage_localization_loss_weight": 2.0, + "first_stage_objectness_loss_weight": 1.0, + "initial_crop_size": 14, + "maxpool_kernel_size": 2, + "maxpool_stride": 2, + "second_stage_regularizer_weight": 0.0, + "second_stage_initializer_factor": 1.0, + "second_stage_initializer_mode": 2, + "second_stage_use_dropout": false, + "second_stage_dropout_keep_probability": 1.0, + "second_stage_nms_score_threshold": 0.3, + "second_stage_nms_iou_threshold": 0.6, + "second_stage_max_detections_per_class": 100, + "second_stage_max_detections_max_total_detections": 100, + "second_stage_localization_loss_weight": 2.0, + "second_stage_classification_loss_weight": 1.0 + }, + "frcnn-res101-low": { + "num_clones": 1, + "batch_size": 1, + "num_steps": 10, + "num_classes": 90, + "initial_learning_rate": 0.0003, + "warmup_learning_rate": 0.0, + "warmup_steps": 5000, + "momentum_optimizer_value": 0.9, + "min_dimension": 600, + "max_dimension": 1024, + "first_stage_features_stride": 16, + "height_stride": 16, + "width_stride": 16, + "first_stage_regularizer_weight": 0.0, + "first_stage_initializer_stddev": 0.01, + "first_stage_nms_score_threshold": 0.0, + "first_stage_nms_iou_threshold": 0.7, + "first_stage_max_proposals": 20, + "first_stage_localization_loss_weight": 2.0, + "first_stage_objectness_loss_weight": 1.0, + "initial_crop_size": 14, + "maxpool_kernel_size": 2, + "maxpool_stride": 2, + "second_stage_regularizer_weight": 0.0, + "second_stage_initializer_factor": 1.0, + "second_stage_initializer_mode": 2, + "second_stage_use_dropout": false, + "second_stage_dropout_keep_probability": 1.0, + "second_stage_nms_score_threshold": 0.3, + "second_stage_nms_iou_threshold": 0.6, + "second_stage_max_detections_per_class": 20, + "second_stage_max_detections_max_total_detections": 20, + "second_stage_localization_loss_weight": 2.0, + "second_stage_classification_loss_weight": 1.0 + }, + "frcnn-nas-coco": { + "num_clones": 1, + "batch_size": 1, + 
"num_steps": 10000, + "num_classes": 90, + "initial_learning_rate": 0.0003, + "warmup_learning_rate": 0.0, + "warmup_steps": 5000, + "momentum_optimizer_value": 0.9, + "min_dimension": 600, + "max_dimension": 1024, + "first_stage_features_stride": 16, + "height_stride": 16, + "width_stride": 16, + "first_stage_regularizer_weight": 0.0, + "first_stage_initializer_stddev": 0.01, + "first_stage_nms_score_threshold": 0.0, + "first_stage_nms_iou_threshold": 0.7, + "first_stage_max_proposals": 20, + "first_stage_localization_loss_weight": 2.0, + "first_stage_objectness_loss_weight": 1.0, + "initial_crop_size": 14, + "maxpool_kernel_size": 2, + "maxpool_stride": 2, + "second_stage_regularizer_weight": 0.0, + "second_stage_initializer_factor": 1.0, + "second_stage_initializer_mode": 2, + "second_stage_use_dropout": false, + "second_stage_dropout_keep_probability": 1.0, + "second_stage_nms_score_threshold": 0.3, + "second_stage_nms_iou_threshold": 0.6, + "second_stage_max_detections_per_class": 20, + "second_stage_max_detections_max_total_detections": 20, + "second_stage_localization_loss_weight": 2.0, + "second_stage_classification_loss_weight": 1.0 + }, + "ssd-mobilenet-v1-coco2": { + "num_clones": 1, + "batch_size": 24, + "num_steps": 15000, + "num_classes": 90, + "initial_learning_rate": 0.004, + "decay_steps": 800720, + "decay_factor": 0.95, + "momentum_decay": 0.9, + "momentum_epsilon": 1.0, + "momentum_optimizer_value": 0.9, + "image_height": 300, + "image_width": 300, + "depth_multiplier": 1.0, + "min_depth": 16, + "first_stage_regularizer_weight": 0.00004, + "first_stage_initializer_mean": 0.0, + "first_stage_initializer_stddev": 0.03, + "first_stage_activation": 2, + "first_stage_batchnorm_decay": 0.9997, + "first_stage_batchnorm_epsilon": 0.001, + "boxcoder_y_scale": 10.0, + "boxcoder_x_scale": 10.0, + "boxcoder_height_scale": 5.0, + "boxcoder_width_scale": 5.0, + "matched_threshold": 0.5, + "unmatched_threshold": 0.5, + "second_stage_regularizer_weight": 
0.00004, + "second_stage_initializer_mean": 0.0, + "second_stage_initializer_stddev": 0.03, + "second_stage_activation": 2, + "second_stage_batchnorm_decay": 0.9997, + "second_stage_batchnorm_epsilon": 0.001, + "second_stage_min_depth": 0, + "second_stage_max_depth": 0, + "second_stage_num_layers_before_predictor": 0, + "second_stage_use_dropout": false, + "second_stage_dropout_keep_probability": 0.8, + "second_stage_dropout_kernel_size": 3, + "second_stage_dropout_box_code_size": 4, + "anchor_generator_num_layers": 6, + "anchor_generator_min_scale": 0.2, + "anchor_generator_max_scale": 0.95, + "second_stage_nms_score_threshold": 0.3, + "second_stage_nms_iou_threshold": 0.6, + "second_stage_max_detections_per_class": 100, + "second_stage_max_detections_max_total_detections": 100, + "second_stage_localization_loss_weight": 1.0, + "second_stage_classification_loss_weight": 1.0 + }, + "ssd-mobilenet-v2-coco": { + "num_clones": 1, + "batch_size": 24, + "num_steps": 15000, + "num_classes": 90, + "initial_learning_rate": 0.004, + "decay_steps": 800720, + "decay_factor": 0.95, + "momentum_decay": 0.9, + "momentum_epsilon": 1.0, + "momentum_optimizer_value": 0.9, + "image_height": 300, + "image_width": 300, + "depth_multiplier": 1.0, + "min_depth": 16, + "first_stage_regularizer_weight": 0.00004, + "first_stage_initializer_mean": 0.0, + "first_stage_initializer_stddev": 0.03, + "first_stage_activation": 2, + "first_stage_batchnorm_decay": 0.9997, + "first_stage_batchnorm_epsilon": 0.001, + "boxcoder_y_scale": 10.0, + "boxcoder_x_scale": 10.0, + "boxcoder_height_scale": 5.0, + "boxcoder_width_scale": 5.0, + "matched_threshold": 0.5, + "unmatched_threshold": 0.5, + "second_stage_regularizer_weight": 0.00004, + "second_stage_initializer_mean": 0.0, + "second_stage_initializer_stddev": 0.03, + "second_stage_activation": 2, + "second_stage_batchnorm_decay": 0.9997, + "second_stage_batchnorm_epsilon": 0.001, + "second_stage_min_depth": 0, + "second_stage_max_depth": 0, + 
"second_stage_num_layers_before_predictor": 0, + "second_stage_use_dropout": false, + "second_stage_dropout_keep_probability": 0.8, + "second_stage_dropout_kernel_size": 3, + "second_stage_dropout_box_code_size": 4, + "anchor_generator_num_layers": 6, + "anchor_generator_min_scale": 0.2, + "anchor_generator_max_scale": 0.95, + "second_stage_nms_score_threshold": 0.3, + "second_stage_nms_iou_threshold": 0.6, + "second_stage_max_detections_per_class": 100, + "second_stage_max_detections_max_total_detections": 100, + "second_stage_localization_loss_weight": 1.0, + "second_stage_classification_loss_weight": 1.0 + }, + "ssdlite-mobilenet-coco": { + "num_clones": 1, + "batch_size": 24, + "num_steps": 1000, + "num_classes": 90, + "initial_learning_rate": 0.004, + "decay_steps": 800720, + "decay_factor": 0.95, + "momentum_decay": 0.9, + "momentum_epsilon": 0.000003, + "momentum_optimizer_value": 1.0, + "image_height": 300, + "image_width": 300, + "depth_multiplier": 1.0, + "min_depth": 16, + "first_stage_regularizer_weight": 0.00004, + "first_stage_initializer_mean": 0.0, + "first_stage_initializer_stddev": 0.03, + "first_stage_activation": 2, + "first_stage_batchnorm_decay": 0.9997, + "first_stage_batchnorm_epsilon": 0.001, + "boxcoder_y_scale": 10.0, + "boxcoder_x_scale": 10.0, + "boxcoder_height_scale": 5.0, + "boxcoder_width_scale": 5.0, + "matched_threshold": 0.5, + "unmatched_threshold": 0.5, + "second_stage_regularizer_weight": 0.00004, + "second_stage_initializer_mean": 0.0, + "second_stage_initializer_stddev": 0.03, + "second_stage_activation": 2, + "second_stage_batchnorm_decay": 0.9997, + "second_stage_batchnorm_epsilon": 0.001, + "second_stage_min_depth": 0, + "second_stage_max_depth": 0, + "second_stage_num_layers_before_predictor": 0, + "second_stage_use_dropout": false, + "second_stage_dropout_keep_probability": 0.8, + "second_stage_dropout_kernel_size": 3, + "second_stage_dropout_box_code_size": 4, + "anchor_generator_num_layers": 6, + 
def parse_extras(extras):
    """Parse the newline-separated ``key=value`` hyperparameter text.

    Everything after a ``#`` on a line is treated as a comment, spaces are
    removed, and lines that end up empty are skipped. Values stay strings.

    Args:
        extras: Raw text passed via ``--extras``; may be ``None`` or empty.

    Returns:
        dict mapping each key to its string value.

    Raises:
        ValueError: If a non-empty line is not of the form ``key=value``.
    """
    parsed = {}
    if not extras:
        return parsed
    for raw_line in extras.split('\n'):
        entry = raw_line.split('#')[0].replace(' ', '').strip()
        if not entry:
            continue
        # partition keeps any further '=' characters inside the value.
        key, separator, value = entry.partition('=')
        if not separator or not key:
            raise ValueError(
                'Invalid hyperparameter line (expected key=value): {!r}'.format(raw_line))
        parsed[key] = value
    return parsed


def main(params):
    """Prepare the base model, run TF2 object-detection training, then export.

    Args:
        params: dict of run parameters. ``sys_finetune_checkpoint`` and
            ``dataset`` are read here; the whole dict is handed to
            ``create_pipeline``.

    Raises:
        RuntimeError: If the training or the export subprocess exits non-zero.
    """
    models_dir = '/mnt/data/models'
    if not os.path.isdir(models_dir):
        # A stray regular file at this path would make makedirs fail.
        try:
            os.remove(models_dir)
        except OSError:
            pass
        os.makedirs(models_dir)

    # An empty or whitespace-only value (the argparse default is ' ') means no
    # fine-tune checkpoint was supplied, so the base model must be downloaded.
    if not params['sys_finetune_checkpoint'].strip():
        print('base model does not exist, downloading...')
        # NOTE: the URL has no placeholder, so the same Faster R-CNN ResNet50
        # archive is fetched whatever params['model'] is.
        archive_path = os.path.join(models_dir, 'model.tar')
        urllib.request.urlretrieve(
            'http://download.tensorflow.org/models/object_detection/tf2/20200711/faster_rcnn_resnet50_v1_800x1333_coco17_gpu-8.tar.gz',
            archive_path)
        with tarfile.open(archive_path) as model_files:
            model_files.extractall(models_dir)
        # Flatten the extracted directory into models_dir so pipeline.config
        # and checkpoint/ sit at the paths used below.
        model_dir = os.path.join(models_dir, 'faster_rcnn_resnet50_v1_800x1333_coco17_gpu-8')
        for entry in os.listdir(model_dir):
            shutil.move(os.path.join(model_dir, entry), models_dir)

    # NOTE(review): the code below reads 'epochs' and 'dataset' from the value
    # returned by create_pipeline - confirm it returns the processed params dict.
    params = create_pipeline('/mnt/data/models/pipeline.config',
                             '/mnt/data/models/checkpoint/ckpt-0',
                             params['dataset']+'/label_map.pbtxt',
                             params['dataset']+'/*.tfrecord',
                             params['dataset']+'/default.tfrecord',
                             '/mnt/output/pipeline.config',
                             params)

    os.chdir('/mnt/output')
    # exist_ok avoids a crash when the output volume is reused.
    os.makedirs('eval/', exist_ok=True)
    return_code = subprocess.call(['python',
        '/mnt/src/tf/research/object_detection/model_main_tf2.py',
        '--pipeline_config_path=/mnt/output/pipeline.config',
        '--model_dir=/mnt/output/',
        '--num_train_steps={}'.format(params['epochs']),
        '--alsologtostderr'
    ])
    if return_code != 0:
        print(return_code)
        raise RuntimeError('Training process failed')
    return_code = subprocess.call(['python',
        '/mnt/src/tf/research/object_detection/exporter_main_v2.py',
        '--input_type=image_tensor',
        '--pipeline_config_path=/mnt/output/pipeline.config',
        '--trained_checkpoint_dir=/mnt/output/',
        '--output_directory=/mnt/output/model/'
    ])
    if return_code != 0:
        print(return_code)
        raise RuntimeError('Model export process failed')

    # generate label map
    convert_labels_to_csv(params['dataset'])
    print('Training complete and output saved')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train TFOD.')
    parser.add_argument('--dataset', default='/mnt/data/datasets')
    parser.add_argument('--extras', help='hyperparameters or other configs')
    parser.add_argument('--sys_finetune_checkpoint', default=' ', help='path to checkpoint')
    parser.add_argument('--model', default='frcnn-res50-coco', help='which model to train')
    parser.add_argument('--num_classes', default=81, type=int, help='number of classes')
    args = parser.parse_args()
    print('Arguments: ', args)
    # Hyperparameters arrive as newline-separated key=value pairs,
    # e.g. "num-steps=100".
    params = parse_extras(args.extras)
    # Command-line arguments override same-named hyperparameters.
    params.update(vars(args))
    print('Processed parameters: ', params)
    main(params)
a/workflows/tf-object-detection-training-tf2/template.yaml b/workflows/tf-object-detection-training-tf2/template.yaml new file mode 100644 index 00000000..ae15d5f9 --- /dev/null +++ b/workflows/tf-object-detection-training-tf2/template.yaml @@ -0,0 +1,179 @@ +arguments: + parameters: + - name: cvat-annotation-path + value: annotation-dump/sample_dataset + displayName: Dataset path + hint: Path to annotated data (TFRecord format) in default object storage. In CVAT, this parameter will be pre-populated. + visibility: internal + + - name: cvat-output-path + value: workflow-data/output/sample_output + hint: Path to store output artifacts in default object storage (i.e. S3). In CVAT, this parameter will be pre-populated. + displayName: Workflow output path + visibility: internal + + - name: cvat-model + value: frcnn-res50-coco + displayName: Model + hint: TF Detection API's model to use for training. + type: select.select + visibility: public + options: + - name: 'Faster RCNN-ResNet 101-COCO' + value: frcnn-res101-coco + - name: 'Faster RCNN-ResNet 101-Low Proposal-COCO' + value: frcnn-res101-low + - name: 'Faster RCNN-ResNet 50-COCO' + value: frcnn-res50-coco + - name: 'Faster RCNN-NAS-COCO' + value: frcnn-nas-coco + - name: 'SSD MobileNet V1-COCO' + value: ssd-mobilenet-v1-coco2 + - name: 'SSD MobileNet V2-COCO' + value: ssd-mobilenet-v2-coco + - name: 'SSDLite MobileNet-COCO' + value: ssdlite-mobilenet-coco + + - name: hyperparameters + value: |- + num-steps=10000 + displayName: Hyperparameters + visibility: public + type: textarea.textarea + hint: 'See documentation for more information on parameters.' + + - name: cvat-finetune-checkpoint + value: '' + hint: Select the last fine-tune checkpoint for this model. It may take up to 5 minutes for a recent checkpoint to show here. Leave empty if this is the first time you're training this model. + displayName: Checkpoint path + visibility: public + + - name: cvat-num-classes + value: '10' + hint: Number of classes. 
In CVAT, this parameter will be pre-populated. + displayName: Number of classes + visibility: internal + + - name: tf-image + value: tensorflow/tensorflow:2.4.0 + type: select.select + displayName: Select TensorFlow image + visibility: public + hint: Select the GPU image if you are running on a GPU node pool + options: + - name: 'TensorFlow 2.4.0 CPU Image' + value: 'tensorflow/tensorflow:2.4.0' + - name: 'TensorFlow 2.4.0 GPU Image' + value: 'tensorflow/tensorflow:2.4.0-gpu' + + - name: dump-format + value: cvat_tfrecord + visibility: public + + - displayName: Node pool + hint: Name of node pool or group to run this workflow task + type: select.nodepool + name: sys-node-pool + value: default + visibility: public + required: true + +entrypoint: main +templates: + - dag: + tasks: + - name: train-model + template: tensorflow + name: main + - container: + args: + - | + apt-get update && \ + apt-get install -y git wget unzip libglib2.0-0 libsm6 libxext6 libxrender-dev libgl1-mesa-glx && \ + cd /mnt/src/tf/research && \ + mkdir -p /mnt/src/protoc && \ + wget -P /mnt/src/protoc https://github.com/protocolbuffers/protobuf/releases/download/v3.14.0/protoc-3.14.0-linux-x86_64.zip && \ + cd /mnt/src/protoc/ && \ + unzip protoc-3.14.0-linux-x86_64.zip && \ + cd /mnt/src/tf/research/ && \ + /mnt/src/protoc/bin/protoc object_detection/protos/*.proto --python_out=. && \ + cp object_detection/packages/tf2/setup.py . && \ + python -m pip install --use-feature=2020-resolver . 
&& \ + cd /mnt/src/train/workflows/tf-object-detection-training-tf2 && \ + python main.py \ + --extras="{{workflow.parameters.hyperparameters}}" \ + --model="{{workflow.parameters.cvat-model}}" \ + --num_classes="{{workflow.parameters.cvat-num-classes}}" \ + --sys_finetune_checkpoint={{workflow.parameters.cvat-finetune-checkpoint}} + command: + - sh + - -c + image: '{{workflow.parameters.tf-image}}' + volumeMounts: + - mountPath: /mnt/data + name: data + - mountPath: /mnt/output + name: output + workingDir: /mnt/src + nodeSelector: + beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' + inputs: + artifacts: + - name: data + path: /mnt/data/datasets/ + s3: + key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}' + - name: models + path: /mnt/data/models/ + optional: true + s3: + key: '{{workflow.parameters.cvat-finetune-checkpoint}}' + - git: + repo: https://github.com/tensorflow/models.git + name: src + path: /mnt/src/tf + - git: + repo: https://github.com/onepanelio/templates.git + revision: main + name: tsrc + path: /mnt/src/train + name: tensorflow + outputs: + artifacts: + - name: model + optional: true + path: /mnt/output + s3: + key: '{{workflow.namespace}}/{{workflow.parameters.cvat-output-path}}/{{workflow.name}}' + sidecars: + - name: tensorboard + image: tensorflow/tensorflow:2.3.0 + command: + - sh + - '-c' + env: + - name: ONEPANEL_INTERACTIVE_SIDECAR + value: 'true' + args: + # Read logs from /mnt/output - this directory is auto-mounted from volumeMounts + - tensorboard --logdir /mnt/output/train/ + ports: + - containerPort: 6006 + name: tensorboard +volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 200Gi + - metadata: + name: output + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 200Gi diff --git a/workflows/tf-object-detection-training-tf2/train copy.py b/workflows/tf-object-detection-training-tf2/train 
def count_samples(dataset):
    """Return the number of elements in ``dataset`` as a scalar int64 tensor.

    The count is computed by reducing over the dataset, so the dataset is
    iterated once in full.
    """
    return dataset.reduce(np.int64(0), lambda total, _: total + 1)


def split_dataset(path, train_path, val_path, val_ratio=0.2, seed=99):
    """Shuffle a TFRecord file and write it out as two TFRecord files.

    Args:
        path: Input TFRecord file.
        train_path: Output path for the first ``val_ratio`` share of samples.
        val_path: Output path for the remaining samples.
        val_ratio: Fraction in [0, 1]; values outside that range fall back to
            0.2. NOTE: despite its name, this fraction sizes the *train*
            split (``train_size = val_ratio * count``), which is how the
            caller in this file uses it (it passes ``train-val-ratio``).
        seed: Shuffle seed.
    """
    print('\nSplitting up dataset')
    # Hyperparameters parsed from text arrive as strings.
    val_ratio = float(val_ratio)
    if (val_ratio > 1.0) or (val_ratio < 0.0):
        val_ratio = 0.2

    # Load Dataset
    full_dataset = tf.data.TFRecordDataset(path)

    # Count samples
    cnt = count_samples(full_dataset)
    print('full dataset size: {}'.format(cnt))

    # Split. reshuffle_each_iteration=False keeps one fixed order; without it
    # the take/skip datasets are each iterated over a differently shuffled
    # sequence, so the two splits can overlap.
    full_dataset = full_dataset.shuffle(
        cnt, seed=seed, reshuffle_each_iteration=False)
    train_size = int(val_ratio * cnt.numpy())
    train_dataset = full_dataset.take(train_size)
    val_dataset = full_dataset.skip(train_size)

    # Save train and validation datasets
    train_writer = tf.data.experimental.TFRecordWriter(train_path)
    train_writer.write(train_dataset)
    val_writer = tf.data.experimental.TFRecordWriter(val_path)
    val_writer.write(val_dataset)

    # Count new dataset samples
    train_cnt = count_samples(train_dataset)
    val_cnt = count_samples(val_dataset)
    print('train dataset size: {}'.format(train_cnt))
    print('validation dataset size: {}'.format(val_cnt))


def convert_labels_to_csv(path, output_dir='/mnt/output/'):
    """Convert ``label_map.pbtxt`` into ``classes.csv`` and ``label_map.json``.

    Each line of the label map is split at its first ``:``; only lines whose
    field name is exactly ``id`` or ``name`` are used, so a class name that
    merely contains the letters "id" (e.g. 'spider') is not mistaken for an
    id line.

    Args:
        path: Directory containing ``label_map.pbtxt``.
        output_dir: Directory the two output files are written to.

    Output files:
        classes.csv: header row ``labels`` followed by one class name per row.
        label_map.json: ``{"label_map": {<id>: <name>, ...}}`` with ids as
            strings; an id with no following name maps to ``null``.
    """
    with open(os.path.join(path, 'label_map.pbtxt'), 'r') as f:
        lines = f.readlines()
    print('Generating label maps file...')

    names = []
    label_map = {}
    current_id = None
    for line in lines:
        field, separator, value = line.partition(':')
        if not separator:
            continue
        field = field.strip()
        value = value.strip()
        if field == 'id':
            current_id = value
            label_map[current_id] = None
        elif field == 'name':
            name = value.strip('\'"')
            names.append(name)
            # A name seen before any id has nothing to attach to.
            if current_id is not None:
                label_map[current_id] = name

    with open(os.path.join(output_dir, 'classes.csv'), 'w', newline='') as csv_out:
        csv_writer = csv.writer(csv_out)
        csv_writer.writerow(['labels'])
        csv_writer.writerows([name] for name in names)

    with open(os.path.join(output_dir, 'label_map.json'), 'w') as outfile:
        json.dump({'label_map': label_map}, outfile)
    print('Finished generating label maps file')
+ pipeline_config.model.faster_rcnn.image_resizer.keep_aspect_ratio_resizer.max_dimension = int(params['max-dimension']) + if 'initial-learning-rate' in params: + pipeline_config.train_config.optimizer.momentum_optimizer.learning_rate.manual_step_learning_rate.initial_learning_rate = float(params['initial-learning-rate']) + if 'schedule-step-1' in params: + pipeline_config.train_config.optimizer.momentum_optimizer.learning_rate.manual_step_learning_rate.schedule[0].step = int(params['schedule-step-1']) + if 'schedule-step-2' in params: + pipeline_config.train_config.optimizer.momentum_optimizer.learning_rate.manual_step_learning_rate.schedule[1].step = int(params['schedule-step-2']) + + pipeline_config.train_config.fine_tune_checkpoint=model_path + pipeline_config.train_config.fine_tune_checkpoint_type="detection" + pipeline_config.train_config.batch_size = 1 + pipeline_config.train_config.num_steps=int(params['epochs']) + pipeline_config.train_input_reader.label_map_path=label_path + pipeline_config.train_input_reader.tf_record_input_reader.input_path[0]=train_tfrecord_path + + # pipeline_config.eval_config.metrics_set='coco_detection_metrics' + + pipeline_config.eval_input_reader[0].label_map_path=label_path + pipeline_config.eval_input_reader[0].tf_record_input_reader.input_path[0]=eval_tfrecord_path + + config_text = text_format.MessageToString(pipeline_config) + with tf.io.gfile.GFile(out_pipeline_path, 'wb') as f: + f.write(config_text) + +def main(params): + if not os.path.isdir('/mnt/data/models'): + try: + os.remove('/mnt/data/models') + except: + pass + os.makedirs('/mnt/data/models/') + + #check if base model exists, if not then download + if params['sys_finetune_checkpoint'] == '': + print('base model does not exist, downloading...') + urllib.request.urlretrieve('http://download.tensorflow.org/models/object_detection/tf2/20200711/faster_rcnn_resnet50_v1_800x1333_coco17_gpu-8.tar.gz'.format(params['model']), '/mnt/data/models/model.tar') + model_files = 
tarfile.open('/mnt/data/models/model.tar') + model_files.extractall('/mnt/data/models') + model_files.close() + model_dir = '/mnt/data/models/'+'faster_rcnn_resnet50_v1_800x1333_coco17_gpu-8' + files = os.listdir(model_dir) + for f in files: + shutil.move(model_dir+'/'+f,'/mnt/data/models') + + model_architecture = 'frcnn' + if 'num-clones' not in params: + params['num-clones'] = 1 + if 'train-val-ratio' not in params: + params['train-val-ratio'] = 0.8 + if 'num-steps' not in params: + params['num-steps'] = 10000 + if 'ssd-mobilenet-v2-coco' in params['model'] or 'ssd-mobilenet-v1-coco2' in params['model']: + if 'num-steps' not in params: + params['num-steps'] = 15000 + model_architecture = 'ssd' + elif 'frcnn-res101-low' in params['model'] or 'frcnn-nas-coco' in params['model']: + if 'num-steps' not in params: + params['num-steps'] = 10 + elif 'ssdlite-mobilenet-coco' in params['model']: + if 'num-steps' not in params: + params['num-steps'] = 10 + model_architecture = 'ssd' + params['epochs'] = params.pop('num-steps') + + split_dataset( + params['dataset']+'/default.tfrecord', + params['dataset']+'/train.tfrecord', + params['dataset']+'/validation.tfrecord', + params['train-val-ratio'] + ) + + create_pipeline('/mnt/data/models/pipeline.config', + '/mnt/data/models/checkpoint/ckpt-0', + params['dataset']+'/label_map.pbtxt', + params['dataset']+'/train.tfrecord', + params['dataset']+'/validation.tfrecord', + '/mnt/output/pipeline.config', + model_architecture, + params) + + os.chdir('/mnt/output') + os.mkdir('eval/') + os.mkdir('model/') + subprocess.call(['python', + '/mnt/src/tf/research/object_detection/model_main_tf2.py', + '--pipeline_config_path=/mnt/output/pipeline.config', + '--model_dir=/mnt/output/', + '--num_train_steps={}'.format(params['epochs']) + ]) + subprocess.call(['python', + '/mnt/src/tf/research/object_detection/exporter_main_v2.py', + '--input_type=image_tensor', + '--pipeline_config_path=/mnt/output/pipeline.config', + 
# (legacy key, current key) pairs, applied in order; a later pair overrides
# an earlier one that targets the same current key.
_LEGACY_PARAM_ALIASES = (
    ("num-clones", "num_clones"),
    ("batch-size", "batch_size"),
    ("initial-learning-rate", "initial_learning_rate"),
    ("learning_rate", "initial_learning_rate"),
    ("num-steps", "num_steps"),
    ("schedule-step-1", "schedule_step_1"),
    ("schedule-lr-1", "schedule_lr_1"),
    ("schedule-step-2", "schedule_step_2"),
    ("schedule-lr-2", "schedule_lr_2"),
    ("image-height", "image_height"),
    ("image-width", "image_width"),
)

# Model identifiers that select the SSD branch of the pipeline config.
_SSD_MODEL_NAMES = (
    'ssd-mobilenet-v2-coco',
    'ssd-mobilenet-v1-coco2',
    'ssdlite-mobilenet-coco',
)


def get_default_params(model, defaults_path='defaults.json'):
    """Load the default hyperparameters for ``model``.

    Args:
        model: Key of the model in the defaults file.
        defaults_path: JSON file mapping model names to parameter dicts. The
            default is relative to the current working directory.

    Returns:
        A shallow copy of the model's default parameter dict.

    Raises:
        KeyError: If ``model`` has no entry in the defaults file.
    """
    with open(defaults_path, 'r') as f:
        default_dict = json.load(f)
    if model not in default_dict:
        raise KeyError('No defaults for model {!r}; available: {}'.format(
            model, ', '.join(sorted(default_dict))))
    return default_dict[model].copy()


def legacy_params_compatibility(params):
    """Return a copy of ``params`` with legacy keys mirrored to current names.

    Legacy hyphenated keys (and ``learning_rate``) are kept and also stored
    under their underscore name. The input dict is not modified.
    """
    new_params = params.copy()
    for legacy_key, current_key in _LEGACY_PARAM_ALIASES:
        # Each alias is guarded by its own key; the previous code guarded
        # schedule_lr_2 with "schedule-step-1", raising KeyError.
        if legacy_key in params:
            new_params[current_key] = params[legacy_key]
    return new_params


def process_params(params, defaults_path='defaults.json'):
    """Merge user parameters over the model defaults.

    Args:
        params: User/run parameters; must contain ``model``.
        defaults_path: Passed through to ``get_default_params``.

    Returns:
        ``(model_params, model_architecture)`` where ``model_architecture`` is
        ``'ssd'`` or ``'frcnn'`` and ``model_params`` additionally holds
        ``epochs`` (the step count, at least 2).
    """
    model_params = get_default_params(params['model'], defaults_path)
    params = legacy_params_compatibility(params)

    if any(name in params['model'] for name in _SSD_MODEL_NAMES):
        model_architecture = 'ssd'
    else:
        model_architecture = 'frcnn'

    # User-supplied values win over the defaults.
    model_params.update(params)

    # Read from the merged dict so the model's default step count is used
    # when the user did not supply one.
    model_params['epochs'] = max(int(model_params['num_steps']), 2)

    # Only some models define warmup_steps (the SSD defaults do not). Where
    # present, keep the warmup shorter than the whole run.
    if 'warmup_steps' in model_params:
        if int(model_params['warmup_steps']) >= model_params['epochs']:
            model_params['warmup_steps'] = int(
                max(float(model_params['epochs']) * 0.2, 2))

    return model_params, model_architecture
def _as_bool(value):
    """Coerce a parameter that may arrive as a bool, number or string to bool."""
    if isinstance(value, str):
        text = value.strip().lower()
        if text in ('true', '1', 'yes', 'y', 'on'):
            return True
        if text in ('false', '0', 'no', 'n', 'off', ''):
            return False
        raise ValueError('Cannot interpret %r as a boolean' % (value,))
    return bool(value)


def _set_first_input_path(reader, tfrecord_path):
    """Point the reader's first TFRecord input path at ``tfrecord_path``."""
    input_paths = reader.tf_record_input_reader.input_path
    if len(input_paths) > 0:
        input_paths[0] = tfrecord_path
    else:
        # The template declared no input path; indexing [0] would fail.
        input_paths.append(tfrecord_path)


def _configure_ssd(pipeline_config, model_params):
    """Write the SSD model and RMSProp optimizer settings into the config."""
    ssd = pipeline_config.model.ssd
    ssd.num_classes = int(model_params['num_classes'])
    ssd.image_resizer.fixed_shape_resizer.height = int(model_params['image_height'])
    ssd.image_resizer.fixed_shape_resizer.width = int(model_params['image_width'])

    extractor = ssd.feature_extractor
    extractor.depth_multiplier = float(model_params['depth_multiplier'])
    extractor.min_depth = int(model_params['min_depth'])
    extractor_hp = extractor.conv_hyperparams
    extractor_hp.regularizer.l2_regularizer.weight = float(model_params['first_stage_regularizer_weight'])
    extractor_hp.initializer.truncated_normal_initializer.mean = float(model_params['first_stage_initializer_mean'])
    extractor_hp.initializer.truncated_normal_initializer.stddev = float(model_params['first_stage_initializer_stddev'])
    extractor_hp.activation = int(model_params['first_stage_activation'])
    extractor_hp.batch_norm.decay = float(model_params['first_stage_batchnorm_decay'])
    extractor_hp.batch_norm.epsilon = float(model_params['first_stage_batchnorm_epsilon'])

    box_coder = ssd.box_coder.faster_rcnn_box_coder
    box_coder.y_scale = float(model_params['boxcoder_y_scale'])
    box_coder.x_scale = float(model_params['boxcoder_x_scale'])
    box_coder.height_scale = float(model_params['boxcoder_height_scale'])
    box_coder.width_scale = float(model_params['boxcoder_width_scale'])

    matcher = ssd.matcher.argmax_matcher
    matcher.matched_threshold = float(model_params['matched_threshold'])
    matcher.unmatched_threshold = float(model_params['unmatched_threshold'])

    predictor = ssd.box_predictor.convolutional_box_predictor
    predictor_hp = predictor.conv_hyperparams
    predictor_hp.regularizer.l2_regularizer.weight = float(model_params['second_stage_regularizer_weight'])
    predictor_hp.initializer.truncated_normal_initializer.mean = float(model_params['second_stage_initializer_mean'])
    predictor_hp.initializer.truncated_normal_initializer.stddev = float(model_params['second_stage_initializer_stddev'])
    predictor_hp.activation = int(model_params['second_stage_activation'])
    predictor_hp.batch_norm.decay = float(model_params['second_stage_batchnorm_decay'])
    predictor_hp.batch_norm.epsilon = float(model_params['second_stage_batchnorm_epsilon'])
    predictor.min_depth = int(model_params['second_stage_min_depth'])
    predictor.max_depth = int(model_params['second_stage_max_depth'])
    predictor.num_layers_before_predictor = int(model_params['second_stage_num_layers_before_predictor'])
    # Coerced like every other parameter so a string override is accepted.
    predictor.use_dropout = _as_bool(model_params['second_stage_use_dropout'])
    predictor.dropout_keep_probability = float(model_params['second_stage_dropout_keep_probability'])
    predictor.kernel_size = int(model_params['second_stage_dropout_kernel_size'])
    predictor.box_code_size = int(model_params['second_stage_dropout_box_code_size'])

    anchors = ssd.anchor_generator.ssd_anchor_generator
    anchors.num_layers = int(model_params['anchor_generator_num_layers'])
    anchors.min_scale = float(model_params['anchor_generator_min_scale'])
    anchors.max_scale = float(model_params['anchor_generator_max_scale'])

    nms = ssd.post_processing.batch_non_max_suppression
    nms.score_threshold = float(model_params['second_stage_nms_score_threshold'])
    nms.iou_threshold = float(model_params['second_stage_nms_iou_threshold'])
    nms.max_detections_per_class = int(model_params['second_stage_max_detections_per_class'])
    nms.max_total_detections = int(model_params['second_stage_max_detections_max_total_detections'])

    ssd.loss.classification_weight = float(model_params['second_stage_classification_loss_weight'])
    ssd.loss.localization_weight = float(model_params['second_stage_localization_loss_weight'])

    train_config = pipeline_config.train_config
    rms_prop = train_config.optimizer.rms_prop_optimizer
    lr_decay = rms_prop.learning_rate.exponential_decay_learning_rate
    lr_decay.initial_learning_rate = float(model_params['initial_learning_rate'])
    lr_decay.decay_steps = int(model_params['decay_steps'])
    lr_decay.decay_factor = float(model_params['decay_factor'])
    rms_prop.momentum_optimizer_value = float(model_params['momentum_optimizer_value'])
    rms_prop.decay = float(model_params['momentum_decay'])
    rms_prop.epsilon = float(model_params['momentum_epsilon'])
    train_config.batch_size = int(model_params['batch_size'])


def _configure_faster_rcnn(pipeline_config, model_params):
    """Write the Faster R-CNN model and momentum optimizer settings."""
    frcnn = pipeline_config.model.faster_rcnn
    frcnn.num_classes = int(model_params['num_classes'])

    resizer = frcnn.image_resizer.keep_aspect_ratio_resizer
    resizer.min_dimension = int(model_params['min_dimension'])
    resizer.max_dimension = int(model_params['max_dimension'])

    grid_anchors = frcnn.first_stage_anchor_generator.grid_anchor_generator
    grid_anchors.height_stride = int(model_params['height_stride'])
    grid_anchors.width_stride = int(model_params['width_stride'])

    first_stage_hp = frcnn.first_stage_box_predictor_conv_hyperparams
    first_stage_hp.regularizer.l2_regularizer.weight = float(model_params['first_stage_regularizer_weight'])
    first_stage_hp.initializer.truncated_normal_initializer.stddev = float(model_params['first_stage_initializer_stddev'])

    frcnn.first_stage_nms_score_threshold = float(model_params['first_stage_nms_score_threshold'])
    frcnn.first_stage_nms_iou_threshold = float(model_params['first_stage_nms_iou_threshold'])
    frcnn.first_stage_max_proposals = int(model_params['first_stage_max_proposals'])
    frcnn.first_stage_localization_loss_weight = float(model_params['first_stage_localization_loss_weight'])
    frcnn.first_stage_objectness_loss_weight = float(model_params['first_stage_objectness_loss_weight'])
    frcnn.initial_crop_size = int(model_params['initial_crop_size'])
    frcnn.maxpool_kernel_size = int(model_params['maxpool_kernel_size'])
    frcnn.maxpool_stride = int(model_params['maxpool_stride'])

    predictor = frcnn.second_stage_box_predictor.mask_rcnn_box_predictor
    fc_hp = predictor.fc_hyperparams
    fc_hp.regularizer.l2_regularizer.weight = float(model_params['second_stage_regularizer_weight'])
    fc_hp.initializer.variance_scaling_initializer.factor = float(model_params['second_stage_initializer_factor'])
    fc_hp.initializer.variance_scaling_initializer.mode = int(model_params['second_stage_initializer_mode'])
    # Coerced like every other parameter so a string override is accepted.
    predictor.use_dropout = _as_bool(model_params['second_stage_use_dropout'])
    predictor.dropout_keep_probability = float(model_params['second_stage_dropout_keep_probability'])

    nms = frcnn.second_stage_post_processing.batch_non_max_suppression
    nms.score_threshold = float(model_params['second_stage_nms_score_threshold'])
    nms.iou_threshold = float(model_params['second_stage_nms_iou_threshold'])
    nms.max_detections_per_class = int(model_params['second_stage_max_detections_per_class'])
    nms.max_total_detections = int(model_params['second_stage_max_detections_max_total_detections'])

    frcnn.second_stage_localization_loss_weight = float(model_params['second_stage_localization_loss_weight'])
    frcnn.second_stage_classification_loss_weight = float(model_params['second_stage_classification_loss_weight'])

    train_config = pipeline_config.train_config
    momentum = train_config.optimizer.momentum_optimizer
    cosine_decay = momentum.learning_rate.cosine_decay_learning_rate
    cosine_decay.learning_rate_base = float(model_params['initial_learning_rate'])
    cosine_decay.total_steps = int(model_params['epochs'])
    cosine_decay.warmup_learning_rate = float(model_params['warmup_learning_rate'])
    cosine_decay.warmup_steps = int(model_params['warmup_steps'])
    momentum.momentum_optimizer_value = float(model_params['momentum_optimizer_value'])
    # NOTE(review): batch size is taken from 'num_clones', not 'batch_size'.
    # Kept as in the original; confirm whether this is intentional.
    train_config.batch_size = int(model_params['num_clones'])


def create_pipeline(pipeline_path, model_path, label_path,
                    train_tfrecord_path, eval_tfrecord_path,
                    out_pipeline_path, params):
    """Fill a pipeline config template with training parameters and save it.

    Args:
        pipeline_path: Path of the text-format pipeline config template.
        model_path: Checkpoint path stored as ``fine_tune_checkpoint``.
        label_path: Label map path for the train and eval input readers.
        train_tfrecord_path: TFRecord path for the train input reader.
        eval_tfrecord_path: TFRecord path for the first eval input reader.
        out_pipeline_path: Destination of the rendered pipeline config.
        params: User parameters, resolved through ``process_params``.

    Returns:
        The merged model parameters returned by ``process_params``.
    """
    model_params, model_architecture = process_params(params)

    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    with tf.io.gfile.GFile(pipeline_path, 'r') as f:
        proto_str = f.read()
    text_format.Merge(proto_str, pipeline_config)

    if model_architecture == 'ssd':
        _configure_ssd(pipeline_config, model_params)
    else:  # faster-rcnn based models
        _configure_faster_rcnn(pipeline_config, model_params)

    train_config = pipeline_config.train_config
    train_config.fine_tune_checkpoint = model_path
    train_config.fine_tune_checkpoint_type = "detection"
    train_config.num_steps = int(model_params['epochs'])
    # Tolerate a missing or None value instead of raising KeyError/TypeError.
    if len(model_params.get('sys_finetune_checkpoint') or '') > 1:
        train_config.load_all_detection_checkpoint_vars = True

    train_reader = pipeline_config.train_input_reader
    train_reader.label_map_path = label_path
    _set_first_input_path(train_reader, train_tfrecord_path)

    # Create the first eval reader when the template declares none.
    if len(pipeline_config.eval_input_reader) > 0:
        eval_reader = pipeline_config.eval_input_reader[0]
    else:
        eval_reader = pipeline_config.eval_input_reader.add()
    eval_reader.label_map_path = label_path
    _set_first_input_path(eval_reader, eval_tfrecord_path)

    config_text = text_format.MessageToString(pipeline_config)
    with tf.io.gfile.GFile(out_pipeline_path, 'wb') as f:
        f.write(config_text)
    return model_params