drnas_defaults.yaml
# options: cifar10, cifar100, ImageNet16-120; test accuracy is reported for each
dataset: ImageNet16-120
# in the code base the default value for the seed is 2
# random seeds are logged at run time, but the log files are not provided,
# and the paper does not state which seeds were used
seed: 99
# options: darts (or nb301), nb201
search_space: nasbench301
out_dir: run
optimizer: drnas
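# example (values taken from the option lists above): to search on nb201 with
# cifar10 instead, set
#   dataset: cifar10
#   search_space: nb201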
search:
  checkpoint_freq: 5
  # default batch size in the code is 64
  batch_size: 64
  # learning rate for progressive and original: 0.025
  learning_rate: 0.025
  # minimum learning rate for progressive and original: 0.001
  learning_rate_min: 0.001
  momentum: 0.9
  # weight_decay for progressive and original: 0.0003
  weight_decay: 0.0003
  # for cifar10 the search runs in 2 stages of 25 epochs each;
  # in the code the default number of training epochs for nb201 is 100
  epochs: 100
  warm_start_epochs: 0
  grad_clip: 5
  # for cifar10 the training data (50k) is split equally between weight
  # training and architecture optimization
  train_portion: 0.5
  # for cifar10 the training data (50k) is split equally, so 25k is used here
  data_size: 25000
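  # worked example: cifar10 provides 50,000 training images; with
  # train_portion 0.5, 25,000 of them update the network weights and the
  # other 25,000 update the architecture parameters, hence data_size: 25000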
  # for the four args below the values are the same for ordinary and
  # progressive mode on nb201
  cutout: False
  cutout_length: 16
  cutout_prob: 1.0
  drop_path_prob: 0.0
  # for nb201 this value is false
  unrolled: False
  arch_learning_rate: 0.0003
  # not mentioned for progressive mode, but for ordinary mode it is 1e-3 in nb201
  arch_weight_decay: 0.001
  output_weights: True
  fidelity: 200
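  # note: the sections below (GDAS, RE, LS, GSparsity, BANANAS, BasePredictor)
  # hold hyperparameters for other optimizers that share this config format;
  # presumably they are read only when `optimizer` above is set to the
  # matching algorithm and are ignored for drnas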
  # GDAS
  tau_max: 10
  tau_min: 0.1
  # RE
  sample_size: 10
  population_size: 100
  # LS
  num_init: 10
  # GSparsity -> uncomment the lines below for GSparsity
  # seed: 50
  # grad_clip: 0
  # threshold: 0.000001
  # weight_decay: 120
  # learning_rate: 0.01
  # momentum: 0.8
  # normalization: div
  # normalization_exponent: 0.5
  # batch_size: 256
  # learning_rate_min: 0.0001
  # epochs: 100
  # warm_start_epochs: 0
  # train_portion: 0.9
  # data_size: 25000
  # BANANAS
  k: 10
  num_ensemble: 3
  acq_fn_type: its
  acq_fn_optimization: mutation
  encoding_type: path
  num_arches_to_mutate: 2
  max_mutations: 1
  num_candidates: 100
  # BasePredictor
  predictor_type: var_sparse_gp
  debug_predictor: False
evaluation:
  checkpoint_freq: 30
  # neither the paper nor the code base specifies the batch size; the default value is 64
  batch_size: 64
  learning_rate: 0.025
  learning_rate_min: 0.00
  # momentum is 0.9
  momentum: 0.9
  # for cifar weight_decay is 3e-4
  weight_decay: 0.0003
  # cifar evaluation runs for 600 epochs; for imagenet it is 250
  epochs: 250
  # for imagenet there are 5 warm-start epochs
  warm_start_epochs: 5
  grad_clip: 5
  # uses the whole cifar10 training set (50k) to train from scratch for 600 epochs
  train_portion: 1.
  data_size: 50000
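  # note: unlike the search phase, no validation split is held out here; the
  # architecture is already fixed, so the full 50k images go to weight training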
  # for cifar10 cutout is applied for a fair comparison with previous work
  cutout: True
  # cifar10 cutout length is 16
  cutout_length: 16
  # for cifar10 cutout is applied for a fair comparison with previous work
  cutout_prob: 1.0
  # for cifar the drop path probability is 0.3
  drop_path_prob: 0.2
  # for cifar the auxiliary weight is 0.4
  auxiliary_weight: 0.4
  # there is a partial-channel variable whose default is 1 in ordinary mode and 4 in progressive mode
  # the nb201 code also mentions regularization scales for l2 and kl (used for the dirichlet distribution)
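# loading sketch (a minimal example using PyYAML directly; NASLib ships its
# own config loader, whose exact API may differ between versions):
#   import yaml
#   with open("drnas_defaults.yaml") as f:
#       cfg = yaml.safe_load(f)
#   print(cfg["search"]["epochs"])      # 100
#   print(cfg["evaluation"]["epochs"])  # 250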