Skip to content
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ wheels/
/temp
MANIFEST
.locks/
tmp_test_checkpoints/

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down Expand Up @@ -143,6 +144,7 @@ images
/custom/
megatron_output/
.qoder
.worktrees/

# Pytorch
*.pth
Expand Down
23 changes: 21 additions & 2 deletions client_tools/client_generator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
# Copyright (c) ModelScope Contributors. All rights reserved.
import ast
from pathlib import Path
from typing import Dict, List, Set, Tuple
Expand Down Expand Up @@ -448,6 +448,7 @@ def generate_models():
GetStateDictResponse,
GetTrainConfigsResponse,
SaveResponse,
TrainingProgressResponse,
)


Expand Down Expand Up @@ -617,6 +618,23 @@ def load(self, name: str, **kwargs) -> None:
)
response.raise_for_status()

def load_training_state(self, name: str, **kwargs) -> None:
    """Restore optimizer, scheduler, scaler, RNG and progress metadata from a checkpoint.

    Sends a POST request to the server's ``/load_training_state`` endpoint
    for the named checkpoint and the client's current adapter; any extra
    keyword arguments are forwarded in the request body (and may override
    the defaults). Raises on an HTTP error status.
    """
    payload = {'name': name, 'adapter_name': self.adapter_name}
    payload.update(kwargs)
    resp = http_post(
        url=f'{self.server_url}/load_training_state',
        json_data=payload,
    )
    resp.raise_for_status()

def read_training_progress(self, name: str, **kwargs) -> Dict[str, Any]:
    """Fetch progress-only checkpoint metadata for resume-only-model flows.

    POSTs to the server's ``/read_training_progress`` endpoint with the
    checkpoint name and the client's adapter name (extra keyword arguments
    are merged into the request body and may override the defaults), raises
    on an HTTP error status, and returns the ``result`` field of the parsed
    ``TrainingProgressResponse``.
    """
    request_body = {'name': name, 'adapter_name': self.adapter_name}
    request_body.update(kwargs)
    response = http_post(
        url=f'{self.server_url}/read_training_progress',
        json_data=request_body,
    )
    response.raise_for_status()
    parsed = TrainingProgressResponse(**response.json())
    return parsed.result

def apply_patch(self, patch_cls: str, **kwargs) -> None:
"""Apply a patch to the model."""
response = http_post(
Expand Down Expand Up @@ -850,4 +868,5 @@ def apply_patch(self, patch_cls: str, **kwargs) -> None:
generate_samplers()

print('\n' + '=' * 60)
print('\n✓ All client code generation complete!\n')
print('\nAll client code generation complete!\n')

24 changes: 21 additions & 3 deletions docs/source_zh/使用指引/Qwen3.5最佳实践.md
Original file line number Diff line number Diff line change
Expand Up @@ -410,9 +410,19 @@ def train():
model.set_lr_scheduler('LinearLR')

# 恢复训练(如有检查点)
if resume_path:
logger.info(f'Resuming training from {resume_path}')
model.load(resume_path, load_optimizer=True)
resume_from_checkpoint = resume_path
resume_only_model = False
ignore_data_skip = False
if resume_from_checkpoint:
logger.info(f'Resuming training from {resume_from_checkpoint}')
model.load(name=resume_from_checkpoint)

if not resume_only_model:
model.load_training_state(resume_from_checkpoint)
progress = model.read_training_progress(resume_from_checkpoint)
dataloader.skip_consumed_samples(progress['consumed_train_samples'])
elif not ignore_data_skip:
progress = model.read_training_progress(resume_from_checkpoint)
dataloader.skip_consumed_samples(progress['consumed_train_samples'])

logger.info(model.get_train_configs())

Expand Down Expand Up @@ -445,6 +455,14 @@ if __name__ == '__main__':
- 支持断点续训、检查点管理
- 可动态切换 LoRA 适配器、损失函数、优化器等组件

Resume 模式:

- `resume_from_checkpoint=None`:开始新的训练任务。
- `resume_only_model=False`:恢复权重、optimizer、scheduler、scaler、RNG 和进度元数据。
- `resume_only_model=True` 且 `ignore_data_skip=False`:恢复权重,读取进度元数据,并跳过已消费样本。
- `resume_only_model=True` 且 `ignore_data_skip=True`:只恢复权重,训练步数和数据进度从 0 开始。
- `skip_consumed_samples(...)` 不适用于 iterable / streaming dataset。

### 3.2 Tinker Client:简洁即用

Tinker 是一个轻量级训练 API。Twinkle 对 Tinker 客户端提供完整支持,几行代码就能拉起训练。已有 Tinker 代码的项目可以直接迁移到 Twinkle 服务端。
Expand Down
Loading
Loading