Mirror of https://github.com/huggingface/transformers.git
add gradient accumulation
commit 1ceac85e23
parent 6b0da96b4b
@@ -426,7 +426,7 @@ def main():
     parser.add_argument("--accumulate_gradients",
                         type=int,
                         default=1,
-                        help="Number of steps to accumulate gradient on (divide the single step batch_size)")
+                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
     parser.add_argument("--local_rank",
                         type=int,
                         default=-1,
@@ -452,10 +452,17 @@ def main():
         # print("Initializing the distributed backend: NCCL")
     print("device", device, "n_gpu", n_gpu)

+    if args.accumulate_gradients < 1:
+        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
+                            args.accumulate_gradients))
+
+    args.batch_size = args.batch_size / args.accumulate_gradients
+
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    if n_gpu>0: torch.cuda.manual_seed_all(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)

     if not args.do_train and not args.do_eval:
         raise ValueError("At least one of `do_train` or `do_eval` must be True.")
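The batch_size division added here is what the new help text means by "divide the batch_size and accumulate": each forward/backward pass runs on a fraction of the requested batch, and an optimizer update only happens once the full batch has been seen. A small illustrative sketch of the arithmetic (the values are hypothetical, not from the commit):

# Hypothetical values, for illustration only.
requested_batch_size = 32        # e.g. the value passed as --batch_size
accumulate_gradients = 4         # e.g. the value passed as --accumulate_gradients

# Each forward/backward pass sees a smaller micro-batch ...
per_step_batch_size = requested_batch_size // accumulate_gradients    # 8
# ... and one optimizer update still covers the requested batch.
examples_per_update = per_step_batch_size * accumulate_gradients      # 32

(The script itself uses true division, args.batch_size / args.accumulate_gradients; integer division is used above only to keep the example values whole.)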
@@ -531,11 +538,10 @@ def main():
         train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

         model.train()
-
         for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
             tr_loss = 0
             nb_tr_examples, nb_tr_steps = 0, 0
-            for input_ids, input_mask, segment_ids, label_ids in tqdm(train_dataloader, desc="Iteration"):
+            for step, (input_ids, input_mask, segment_ids, label_ids) in enumerate(tqdm(train_dataloader, desc="Iteration")):
                 input_ids = input_ids.to(device)
                 input_mask = input_mask.float().to(device)
                 segment_ids = segment_ids.to(device)
@@ -546,12 +552,13 @@ def main():
                 loss = loss.mean() # mean() to average on multi-gpu.
                 tr_loss += loss.item()
                 nb_tr_examples += input_ids.size(0)
-
-                model.zero_grad()
-                loss.backward()
-                optimizer.step()
-                global_step += 1
                 nb_tr_steps += 1
+                loss.backward()
+
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    optimizer.step()    # We have accumulated enought gradients
+                    model.zero_grad()
+                    global_step += 1

     if args.do_eval:
         eval_examples = processor.get_dev_examples(args.data_dir)
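Taken together, these hunks replace the unconditional optimizer.step() after every batch with the usual gradient-accumulation pattern: loss.backward() runs on every batch so gradients keep summing in the parameter buffers, and the optimizer only steps (and the gradients are only cleared) every gradient_accumulation_steps batches. (Note that the loop reads args.gradient_accumulation_steps while the argument added above is named --accumulate_gradients.) A self-contained sketch of the pattern, using a placeholder linear model and random data rather than the BERT classifier from the script:

import torch
from torch.utils.data import DataLoader, TensorDataset

# Placeholder model, data and optimizer; only the accumulation logic mirrors the commit.
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
dataset = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))
train_dataloader = DataLoader(dataset, batch_size=8)

gradient_accumulation_steps = 4   # plays the role of --accumulate_gradients
global_step = 0

model.train()
for step, (inputs, labels) in enumerate(train_dataloader):
    loss = criterion(model(inputs), labels)
    loss.backward()                                   # gradients accumulate across micro-batches
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()                              # update once enough gradients have accumulated
        model.zero_grad()                             # clear the buffers for the next window
        global_step += 1

Because each micro-batch loss is an average over fewer examples, implementations often also divide the loss by the number of accumulation steps so that the update magnitude matches a single large batch; the sketch above keeps the commit's behaviour and omits that scaling.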
run_squad.py (21 changed lines)
@@ -731,10 +731,14 @@ def main():
                         type=int,
                         default=-1,
                         help="local_rank for distributed training on gpus")
+    parser.add_argument("--accumulate_gradients",
+                        type=int,
+                        default=1,
+                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
     parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")

     args = parser.parse_args()

@@ -836,8 +840,8 @@ def main():

         model.train()
         for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
-            for input_ids, input_mask, segment_ids, start_positions, end_positions in tqdm(train_dataloader,
-                                                                                           desc="Iteration"):
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
+                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                 input_ids = input_ids.to(device)
                 input_mask = input_mask.float().to(device)
                 segment_ids = segment_ids.to(device)
@@ -851,10 +855,11 @@ def main():
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.

-                model.zero_grad()
                 loss.backward()
-                optimizer.step()
-                global_step += 1
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    optimizer.step()    # We have accumulated enought gradients
+                    model.zero_grad()
+                    global_step += 1

     if args.do_predict:
         eval_examples = read_squad_examples(