This repository was archived by the owner on Apr 11, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 398
/
Copy pathtrain.py
executable file
·191 lines (159 loc) · 9.42 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env python
"""
Usage:
train.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH
train.py [options] [SAVE_FOLDER]
*_DATA_PATH arguments may either accept (1) directory filled with .jsonl.gz files that we use as data,
or a (2) plain text file containing a list of such directories (used for multi-language training).
In the case that you supply a (2) plain text file, all directory names must be separated by a newline.
For example, if you want to read from multiple directories you might have a plain text file called
data_dirs_train.txt with the below contents:
> cat ~/src/data_dirs_train.txt
azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train
azure://semanticcodesearch/csharpdata/split/csharpCrawl-train
Options:
-h --help Show this screen.
--max-num-epochs EPOCHS The maximum number of epochs to run [default: 300]
--max-files-per-dir INT Maximum number of files per directory to load for training data.
--hypers-override HYPERS JSON dictionary overriding hyperparameter values.
--hypers-override-file FILE JSON file overriding hyperparameter values.
--model MODELNAME Choose model type. [default: neuralbowmodel]
--test-batch-size SIZE The size of the batches in which to compute MRR. [default: 1000]
--distance-metric METRIC The distance metric to use [default: cosine]
--run-name NAME Picks a name for the trained model.
--quiet Less output (not one per line per minibatch). [default: False]
--dryrun Do not log run into logging database. [default: False]
--testrun Do a model run on a small dataset for testing purposes. [default: False]
--azure-info PATH Azure authentication information file (JSON). Used to load data from Azure storage.
--evaluate-model PATH Run evaluation on previously trained model.
--sequential Do not parallelise data-loading. Simplifies debugging. [default: False]
--debug Enable debug routines. [default: False]
"""
import json
import os
import sys
import time
from typing import Type, Dict, Any, Optional, List
from pathlib import Path
from docopt import docopt
from dpu_utils.utils import RichPath, git_tag_run, run_and_debug
import wandb
import model_restore_helper
from model_test import compute_evaluation_metrics
from models.model import Model
import model_test as test
def run_train(model_class: Type[Model],
              train_data_dirs: List[RichPath],
              valid_data_dirs: List[RichPath],
              save_folder: str,
              hyperparameters: Dict[str, Any],
              azure_info_path: Optional[str],
              run_name: str,
              quiet: bool = False,
              max_files_per_dir: Optional[int] = None,
              parallelize: bool = True) -> RichPath:
    """Train a model (resuming from an existing checkpoint if one is present)
    and return the path of the saved model.

    Args:
        model_class: model type to instantiate.
        train_data_dirs / valid_data_dirs: directories holding the data.
        save_folder: directory used for model checkpoints and logs.
        hyperparameters: hyperparameter dictionary passed to the model.
        azure_info_path: optional Azure authentication info file.
        run_name: identifier for this training run.
        quiet: suppress per-minibatch output when True.
        max_files_per_dir: optional cap on files loaded per data directory.
        parallelize: parallelise data loading when True.
    """
    model = model_class(hyperparameters, run_name=run_name, model_save_dir=save_folder, log_save_dir=save_folder)
    resume = os.path.exists(model.model_save_path)
    if resume:
        # A checkpoint for this run name already exists: restore and continue.
        model = model_restore_helper.restore(RichPath.create(model.model_save_path), is_train=True)
        model.train_log("Resuming training run %s of model %s with following hypers:\n%s"
                        % (run_name, model.__class__.__name__, str(hyperparameters)))
    else:
        # Fresh run: build the vocabulary/metadata before constructing the model graph.
        model.train_log("Tokenizing and building vocabulary for code snippets and queries. This step may take several hours.")
        model.load_metadata(train_data_dirs, max_files_per_dir=max_files_per_dir, parallelize=parallelize)
        model.make_model(is_train=True)
        model.train_log("Starting training run %s of model %s with following hypers:\n%s"
                        % (run_name, model.__class__.__name__, str(hyperparameters)))

    philly_job_id = os.environ.get('PHILLY_JOB_ID')
    if philly_job_id is not None:
        # Running on Philly: record the model file name in an auxiliary <job-id>.job file.
        with open(os.path.join(save_folder, philly_job_id + '.job'), 'w') as f:
            f.write(os.path.basename(model.model_save_path))

    wandb.config.update(model.hyperparameters)

    model.train_log("Loading training and validation data.")
    train_data = model.load_data_from_dirs(train_data_dirs, is_test=False, max_files_per_dir=max_files_per_dir, parallelize=parallelize)
    valid_data = model.load_data_from_dirs(valid_data_dirs, is_test=False, max_files_per_dir=max_files_per_dir, parallelize=parallelize)

    model.train_log("Begin Training.")
    return model.train(train_data, valid_data, azure_info_path, quiet=quiet, resume=resume)
def make_run_id(arguments: Dict[str, Any]) -> str:
    """Choose a run ID: the PHILLY_JOB_ID when running on Philly, otherwise a
    name derived from --run-name (or the model name) plus the current time."""
    philly_job_id = os.environ.get('PHILLY_JOB_ID')
    if philly_job_id is not None:
        # On Philly the job ID already uniquely identifies the run.
        return philly_job_id
    run_base = arguments.get('--run-name')
    if run_base is None:
        run_base = arguments['--model']
    elif run_base.endswith('.pkl'):
        # Strip a trailing '.pkl' so the ID does not look like a file name.
        run_base = run_base[:-len('.pkl')]
    return "%s-%s" % (run_base, time.strftime("%Y-%m-%d-%H-%M-%S"))
def run(arguments: Dict[str, Any], tag_in_vcs: bool = False) -> None:
    """Top-level driver: resolve CLI arguments, configure wandb logging, then
    either train a new model or load a previously trained one, and finally
    compute evaluation metrics on the validation/test data.

    Args:
        arguments: docopt-parsed argument dictionary (see module docstring).
        tag_in_vcs: if True, tag the current git state with the run name and
            record the commit hash in the hyperparameters.
    """
    azure_info_path = arguments.get('--azure-info', None)
    testrun = arguments.get('--testrun')
    max_files_per_dir=arguments.get('--max-files-per-dir')
    dir_path = Path(__file__).parent.absolute()
    # If no train/valid/test data paths were passed, default to the data-dir
    # list files checked into the repository next to this script.
    if not arguments['TRAIN_DATA_PATH']:
        arguments['TRAIN_DATA_PATH'] = str(dir_path/'data_dirs_train.txt')
        arguments['VALID_DATA_PATH'] = str(dir_path/'data_dirs_valid.txt')
        arguments['TEST_DATA_PATH'] = str(dir_path/'data_dirs_test.txt')
    # Expand each path into a list of data directories (a path may be a
    # directory itself or a text file listing one directory per line).
    train_data_dirs = test.expand_data_path(arguments['TRAIN_DATA_PATH'], azure_info_path)
    valid_data_dirs = test.expand_data_path(arguments['VALID_DATA_PATH'], azure_info_path)
    test_data_dirs = test.expand_data_path(arguments['TEST_DATA_PATH'], azure_info_path)
    # Default model save location.
    if not arguments['SAVE_FOLDER']:
        arguments['SAVE_FOLDER'] = str(dir_path.parent/'resources/saved_models/')
    save_folder = arguments['SAVE_FOLDER']

    model_class = model_restore_helper.get_model_class_from_name(arguments['--model'])
    hyperparameters = model_class.get_default_hyperparameters()
    run_name = make_run_id(arguments)

    # make name of wandb run = run_id (Doesn't populate yet)
    # --max-num-epochs has a docopt default, so .get() is always non-None here.
    hyperparameters['max_epochs'] = int(arguments.get('--max-num-epochs'))

    # Test runs train for only 2 epochs and (unless overridden on the CLI)
    # load a single file per data directory to keep things fast.
    if testrun:
        hyperparameters['max_epochs'] = 2
        if not max_files_per_dir:
            max_files_per_dir = 1

    # Override hyperparameters from an inline JSON dict or a JSON file, if given
    # (the inline --hypers-override takes precedence over the file).
    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))
    elif arguments.get('--hypers-override-file') is not None:
        with open(arguments.get('--hypers-override-file')) as f:
            hyperparameters.update(json.load(f))

    os.makedirs(save_folder, exist_ok=True)

    if tag_in_vcs:
        hyperparameters['git_commit'] = git_tag_run(run_name)

    # Turns off wandb if you don't want to log anything.
    if arguments.get('--dryrun'):
        os.environ["WANDB_MODE"] = 'dryrun'

    # Save hyperparameters to logging; values of type `set` must be filtered
    # out because they are not JSON serializable.
    wandb.init(name=run_name, config={k: v for k, v in hyperparameters.items() if not isinstance(v, set)})
    wandb.config.update({'model-class': arguments['--model'],
                         'train_folder': str(train_data_dirs),
                         'valid_folder': str(valid_data_dirs),
                         'save_folder': str(save_folder),
                         'test_folder': str(test_data_dirs),
                         'CUDA_VISIBLE_DEVICES': os.environ.get("CUDA_VISIBLE_DEVICES", 'Not Set'),
                         'run-name': arguments.get('--run-name'),
                         'CLI-command': ' '.join(sys.argv)})

    if arguments.get('--evaluate-model'):
        # Skip training entirely and evaluate an existing saved model.
        model_path = RichPath.create(arguments['--evaluate-model'])
    else:
        model_path = run_train(model_class, train_data_dirs, valid_data_dirs, save_folder, hyperparameters,
                               azure_info_path, run_name, arguments['--quiet'],
                               max_files_per_dir=max_files_per_dir,
                               parallelize=not(arguments['--sequential']))

    wandb.config['best_model_path'] = str(model_path)
    wandb.save(str(model_path.to_local_path()))

    # Only limit files for evaluation if the `--testrun` flag was passed by the user.
    if testrun:
        compute_evaluation_metrics(model_path, arguments, azure_info_path, valid_data_dirs, test_data_dirs, max_files_per_dir)
    else:
        compute_evaluation_metrics(model_path, arguments, azure_info_path, valid_data_dirs, test_data_dirs)
if __name__ == '__main__':
    # Parse the CLI against the module docstring, then launch the run;
    # run_and_debug drops into a debugger on failure when --debug is set.
    cli_args = docopt(__doc__)
    run_and_debug(lambda: run(cli_args), cli_args['--debug'])