Hi,
I'm trying to run the example notebook given for solubility prediction.
I am getting an error after running the code in one of the cells, shown below. As an additional cell in the notebook, I also ran the check from your utils file that tests for the presence of the accelerate module (see the check cell below); it returns True. Is there any guidance you could provide to make this work? Thanks in advance.
Cheers,
Saif
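
For reference, the extra check cell I added is roughly as follows. I am sketching it from memory: the sys.executable and version prints are my own debugging additions, and is_accelerate_available is the helper transformers itself uses internally, which may differ from the exact check in your utils file.

---------Check cell-----------------------------
import sys
import accelerate
from transformers.utils import is_accelerate_available

print(sys.executable)             # confirm the notebook kernel runs from ankh_venv
print(accelerate.__version__)     # the Trainer requires accelerate>=0.21.0
print(is_accelerate_available())  # prints True for me, yet the ImportError below persists
------------------------------------------------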
---------Code from cell-------------------------
model_type = 'ankh_large'
experiment = f'solubility_{model_type}'
training_args = TrainingArguments(
    output_dir=f'./results_{experiment}',
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=1000,
    learning_rate=1e-03,
    weight_decay=0.0,
    logging_dir=f'./logs_{experiment}',
    logging_steps=200,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    gradient_accumulation_steps=16,
    fp16=False,
    fp16_opt_level="O2",  # Apex AMP level: letter O ("O0"-"O3"), not the digit zero
    run_name=experiment,
    seed=seed,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    save_strategy="epoch"
)
---------Error--------------------------
ImportError Traceback (most recent call last)
Cell In[34], line 4
1 model_type = 'ankh_large'
2 experiment = f'solubility_{model_type}'
----> 4 training_args = TrainingArguments(
5 output_dir=f'./results_{experiment}',
6 num_train_epochs=5,
7 per_device_train_batch_size=1,
8 per_device_eval_batch_size=1,
9 warmup_steps=1000,
10 learning_rate=1e-03,
11 weight_decay=0.0,
12 logging_dir=f'./logs_{experiment}',
13 logging_steps=200,
14 do_train=True,
15 do_eval=True,
16 evaluation_strategy="epoch",
17 gradient_accumulation_steps=16,
18 fp16=False,
19 fp16_opt_level="O2",
20 run_name=experiment,
21 seed=seed,
22 load_best_model_at_end=True,
23 metric_for_best_model="eval_accuracy",
24 greater_is_better=True,
25 save_strategy="epoch"
26 )
File <string>:121, in __init__(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, eval_delay, learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, lr_scheduler_kwargs, warmup_ratio, warmup_steps, log_level, log_level_replica, log_on_each_node, logging_dir, logging_strategy, logging_first_step, logging_steps, logging_nan_inf_filter, save_strategy, save_steps, save_total_limit, save_safetensors, save_on_each_node, save_only_model, no_cuda, use_cpu, use_mps_device, seed, data_seed, jit_mode_eval, use_ipex, bf16, fp16, fp16_opt_level, half_precision_backend, bf16_full_eval, fp16_full_eval, tf32, local_rank, ddp_backend, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, fsdp, fsdp_min_num_params, fsdp_config, fsdp_transformer_layer_cls_to_wrap, deepspeed, label_smoothing_factor, optim, optim_args, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, ddp_bucket_cap_mb, ddp_broadcast_buffers, dataloader_pin_memory, dataloader_persistent_workers, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, hub_model_id, hub_strategy, hub_token, hub_private_repo, hub_always_push, gradient_checkpointing, gradient_checkpointing_kwargs, include_inputs_for_metrics, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters, auto_find_batch_size, full_determinism, torchdynamo, ray_scope, ddp_timeout, torch_compile, torch_compile_backend, torch_compile_mode, dispatch_batches, split_batches, include_tokens_per_second, include_num_input_tokens_seen, neftune_noise_alpha)
File ~/ankh_venv/lib/python3.10/site-packages/transformers/training_args.py:1483, in TrainingArguments.__post_init__(self)
1477 if version.parse(version.parse(torch.__version__).base_version) == version.parse("2.0.0") and self.fp16:
1478 raise ValueError("--optim adamw_torch_fused with --fp16 requires PyTorch>2.0")
1480 if (
1481 self.framework == "pt"
1482 and is_torch_available()
-> 1483 and (self.device.type != "cuda")
1484 and (self.device.type != "npu")
1485 and (self.device.type != "xpu")
1486 and (get_xla_device_type(self.device) != "GPU")
1487 and (self.fp16 or self.fp16_full_eval)
1488 ):
1489 raise ValueError(
1490 "FP16 Mixed precision training with AMP or APEX (--fp16
) and FP16 half precision evaluation"
1491 " (--fp16_full_eval
) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX)."
1492 )
1494 if (
1495 self.framework == "pt"
1496 and is_torch_available()
(...)
1503 and (self.bf16 or self.bf16_full_eval)
1504 ):
File ~/ankh_venv/lib/python3.10/site-packages/transformers/training_args.py:1921, in TrainingArguments.device(self)
1917 """
1918 The device used by this process.
1919 """
1920 requires_backends(self, ["torch"])
-> 1921 return self._setup_devices
File ~/ankh_venv/lib/python3.10/site-packages/transformers/utils/generic.py:54, in cached_property.__get__(self, obj, objtype)
52 cached = getattr(obj, attr, None)
53 if cached is None:
---> 54 cached = self.fget(obj)
55 setattr(obj, attr, cached)
56 return cached
File ~/ankh_venv/lib/python3.10/site-packages/transformers/training_args.py:1831, in TrainingArguments._setup_devices(self)
1829 if not is_sagemaker_mp_enabled():
1830 if not is_accelerate_available():
-> 1831 raise ImportError(
1832 f"Using the `Trainer` with `PyTorch` requires `accelerate>={ACCELERATE_MIN_VERSION}`: "
1833 "Please run `pip install transformers[torch]` or `pip install accelerate -U`"
1834 )
1835 AcceleratorState._reset_state(reset_partial_state=True)
1836 self.distributed_state = None
ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`