# For more information, please refer to the official documentation: https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments
seed: 42 # Random seed for reproducibility
use_cpu: false # Whether to run on CPU (debugging is easier on CPU when first testing the code)
learning_rate: 5.0e-5 # Learning rate
num_train_epochs: 10 # Total number of training epochs
per_device_train_batch_size: 4 # Batch size per GPU for training
per_device_eval_batch_size: 8 # Batch size per GPU for evaluation
output_dir: "train_result" # Output directory
overwrite_output_dir: false # If the output directory exists, do not delete its content
report_to:
- tensorboard # Report logs to TensorBoard
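# With TensorBoard reporting enabled, the Trainer writes event files under the output
# directory (logging_dir defaults to a "runs/..." subdirectory of output_dir), so one
# typical way to inspect the training curves is, for example:
#   tensorboard --logdir train_result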
save_strategy: "steps" # Strategy to save checkpoints
save_steps: 500 # Interval of steps between checkpoint saves; can be an int or a float in (0, 1), where a float is interpreted as a ratio of the total training steps (e.g., 1/2000 of the total steps)
save_total_limit: 5 # Maximum number of checkpoints to keep; the oldest checkpoints are deleted once this limit is exceeded
logging_strategy: "steps" # Logging strategy: log every logging_steps steps
logging_steps: 500 # Number of steps between each log
logging_nan_inf_filter: false # Do not filter out nan/inf losses when logging; record them as-is
optim: "adamw_torch" # Optimizer
lr_scheduler_type: "cosine" # Learning rate scheduler
warmup_ratio: 0.1 # Ratio of warmup steps in total training steps (e.g., for 1000 steps, the first 100 steps gradually increase lr from 0 to the set lr)
max_grad_norm: 1.0 # For gradient clipping, ensure the norm of the gradients does not exceed 1.0 (default 1.0)
fp16: false # Whether to train in 16-bit floating point (fp16) (generally not recommended, as the loss can easily overflow)
bf16: false # Whether to use Brain Floating Point (bfloat16) for training (recommended if architecture supports it)
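# A quick sanity check before enabling bf16 (a sketch; assumes a CUDA build of PyTorch):
#   python -c "import torch; print(torch.cuda.is_bf16_supported())"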
gradient_accumulation_steps: 1 # Gradient accumulation steps; increase this to emulate a larger effective batch size when the per-device batch size cannot be made larger
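# Worked example: the effective global batch size is
#   per_device_train_batch_size * gradient_accumulation_steps * number_of_GPUs,
# so with the values above on a single GPU it is 4 * 1 * 1 = 4.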
jit_mode_eval: false # Whether to use PyTorch JIT tracing during eval (can speed up the model, but the model must be static, otherwise tracing will throw errors)
torch_compile: false # Whether to use torch.compile to compile the model (for better training and inference performance)
dataloader_pin_memory: true # Pin host memory, which can speed up data transfer from CPU to GPU
dataloader_num_workers: 1 # Number of subprocesses for data loading (the default 0 loads data in the main process); a common rule of thumb is 4 * the number of GPUs used
evaluation_strategy: "steps" # Evaluation strategy, can be "steps" or "epoch"
eval_steps: 500 # Interval of steps between evaluations when evaluation_strategy="steps"
remove_unused_columns: false # Don't change this unless you really know what you are doing.
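# One possible way to consume this file (a sketch only; the filename "train_config.yaml"
# is hypothetical) is to parse it straight into TrainingArguments:
#   from transformers import HfArgumentParser, TrainingArguments
#   parser = HfArgumentParser(TrainingArguments)
#   (training_args,) = parser.parse_yaml_file("train_config.yaml")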