43 commits
85b80aa
sync commit to master.
Jun 29, 2023
10c82cd
!837 sync commit to master.
Jun 30, 2023
90aebfc
Remove pip installation.
Jul 3, 2023
7fb5bd3
!865 sync dev to master
Jul 5, 2023
a7c7d60
Change the save path in test cases to fix the CI issue on the repo
Jul 5, 2023
a4a7f56
Remove AutoClass and pipeline CI gate maintenance.
Jul 5, 2023
f9e8c52
Update the image repository address
hehesimida Jul 7, 2023
1d3dc2f
add gen config explanation
hehesimida Jul 6, 2023
4c3c378
Fix the random.seed bug in distributed inference
Jul 3, 2023
85a3172
Fix the init_epoch issue when resuming training from a checkpoint
Jun 30, 2023
b4b6d68
Fix a bug in the offline checkpoint slicing script
Jul 6, 2023
5994cd3
Fix data parallelism not taking effect in flicker8k_dataloader
Jul 3, 2023
0510ba2
!885 sync commit to master
Jul 10, 2023
81d91f7
Merge branch 'master' of gitee.com:mindspore/mindformers into update-…
Jul 24, 2023
8142dbb
!921 sync commit to master
Jul 24, 2023
d96763e
security check
Jul 26, 2023
4969ff2
!929 security check
Jul 27, 2023
6bc1aa7
update README.md.
Jul 25, 2023
f767d1c
refactoring docs
AHHOZP Jul 21, 2023
928a999
create research and delete useless examples.
Jul 24, 2023
54f5af2
Add new parameters to Training_args
Jul 6, 2023
4b92fe0
Update dependencies for 2.1
Aug 2, 2023
3ddfd59
add promptacc metric for pangualpha
xinlianglalala Jul 24, 2023
1b9c1e7
add do_eval test cases
Jul 10, 2023
e77f8b5
hf tokenizer
hehesimida Jun 29, 2023
4df014f
update docs/README.md.
Jul 26, 2023
d68a6a7
update docs/README.md.
Jul 26, 2023
6a91b57
Update file headers for open-source references
Jul 26, 2023
aa86d6e
aicc docs improved
AHHOZP Jul 26, 2023
72de099
add wikitext2 preprocess for gpt2
xinlianglalala Jul 28, 2023
ec379c3
Support Baichuan-7b.
Jul 27, 2023
0c54ba2
Add a README for resumable training
Jul 28, 2023
054414e
Move the model support tables under docs/models
Jul 19, 2023
e831df5
Save the slimmed weights
Jul 11, 2023
7aeab46
add em_f1 metric
hehesimida Jul 26, 2023
6535d70
Add automatic checkpoint slicing/merging feature
Jul 18, 2023
61b61c7
Fix bugs
hehesimida Jul 31, 2023
9bf8e6d
fix aicc for graphs or dump file.
Jul 29, 2023
ecb968f
Add Parallel guide
Jul 29, 2023
e487baa
fix aicc for copytree.
Aug 1, 2023
add8871
Fix the dataset example issue in the glm docs
Aug 1, 2023
43d6b43
Update dependencies for 2.1
Aug 2, 2023
62ee1a4
add cpm
Aug 3, 2023
2 changes: 1 addition & 1 deletion .jenkins/test/config/dependent_packages.yaml
@@ -1,2 +1,2 @@
 mindspore:
-    '/mindspore/mindspore/version/202302/20230215/r1.10_20230215214139_9b65b907f96edf9a278f87e9ccab7d318aed310f/'
+    '/mindspore/mindspore/version/202307/20230731/r2.1_20230731201928_db56d4dffe0f593f9e983142169b74a4555058db/'
78 changes: 53 additions & 25 deletions README.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions configs/README.md
@@ -146,6 +146,8 @@ configs are consolidated in run_xxx.yaml, ordered by modification frequency and by the general
 - prefix: prefix for checkpoint file names
 - save_checkpoint_steps: save a checkpoint once every this many steps
 - integrated_save: whether to save in aggregated form. True aggregates the weights of all devices, so every device saves an identical checkpoint; False has each device save only its own weights. When training large models in semi-auto parallel mode, this usually needs to be False so that saving does not fail due to insufficient memory
+- save_network_params (new): whether to additionally save the slimmed weights. Defaults to True. (See the sketch after this list.)
+- save_trainable_params (new): whether to additionally save the trainable parameter weights, i.e. the weights of the fine-tuned subset of parameters. Defaults to False.
 - async_save: whether to save checkpoint files asynchronously
 - type: ObsMonitor: uploads data to OBS
 - step_upload_frequence: upload once every this many steps. Defaults to -1, which disables step-interval uploads; when set to a value greater than 0, an upload is performed after every that many steps
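A minimal sketch of a checkpoint callback using the two new switches, in the same style as the callbacks section of the CPM config further below; the prefix and step values here are illustrative assumptions, not values from this PR:

callbacks:
  - type: CheckpointMointor        # class name as spelled in this repo
    prefix: "my_model"             # assumed prefix; any name works
    save_checkpoint_steps: 500
    integrated_save: False         # each device saves its own shard (large-model semi-auto parallel)
    save_network_params: True      # additionally save the slimmed weights (new option)
    save_trainable_params: False   # additionally save only the trainable weights (new option)
    async_save: False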
1 change: 1 addition & 0 deletions configs/bert/run_bert_base_uncased.yaml
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'train'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
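The new src_strategy_path_or_dir field is added to every run config in this PR. Judging from the automatic checkpoint slicing/merging commit (6535d70), it presumably names the strategy file or directory that the source checkpoint was saved with, so that auto_trans_ckpt can re-slice it for the current parallel layout; a hedged sketch with assumed paths:

load_checkpoint: './output/checkpoint'        # assumed: an existing (possibly sharded) checkpoint
src_strategy_path_or_dir: './output/strategy' # assumed: strategy the source checkpoint was saved with
auto_trans_ckpt: True                         # transform the checkpoint to the current parallel strategy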
1 change: 1 addition & 0 deletions configs/bert/run_bert_tiny_uncased.yaml
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'train'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
1 change: 1 addition & 0 deletions configs/bloom/run_bloom_176b.yaml
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'train'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
1 change: 1 addition & 0 deletions configs/bloom/run_bloom_560m.yaml
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'train'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
1 change: 1 addition & 0 deletions configs/bloom/run_bloom_65b.yaml
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'train'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
1 change: 1 addition & 0 deletions configs/bloom/run_bloom_7.1b.yaml
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'train'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
1 change: 1 addition & 0 deletions configs/clip/run_clip_vit_b_16_pretrain_flickr8k.yaml
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'train'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
(filename not rendered in this view)
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'eval'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
1 change: 1 addition & 0 deletions configs/clip/run_clip_vit_b_32_pretrain_flickr8k.yaml
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'train'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
(filename not rendered in this view)
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'eval'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
1 change: 1 addition & 0 deletions configs/clip/run_clip_vit_l_14@336_pretrain_flickr8k.yaml
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'train'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
(filename not rendered in this view)
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'eval'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
1 change: 1 addition & 0 deletions configs/clip/run_clip_vit_l_14_pretrain_flickr8k.yaml
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'train'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
(filename not rendered in this view)
@@ -2,6 +2,7 @@ seed: 0
 run_mode: 'eval'
 output_dir: './output' # Customizing this path is not supported yet; do not change the default
 load_checkpoint: ''
+src_strategy_path_or_dir: ''
 auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
 only_save_strategy: False
 resume_training: False
195 changes: 195 additions & 0 deletions configs/cpm/run_cpm_10b_finetune.yaml
@@ -0,0 +1,195 @@
seed: 0
run_mode: 'train'
output_dir: './output' # Customizing this path is not supported yet; do not change the default
load_checkpoint: '/home/m30024275/cpm_model_10b.ckpt'
auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
only_save_strategy: False
resume_training: False

# ==== context config ====
context:
  mode: 0 # 0--Graph Mode; 1--Pynative Mode
  device_target: "Ascend"
  enable_graph_kernel: False
  graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true"
  max_call_depth: 10000
  max_device_memory: "30GB"
  save_graphs: False
  device_id: 0

# aicc
remote_save_url: "Please input obs url on AICC platform."

# ==== model config ====
model:
  model_config:
    type: CPMBeeConfig
    vocab_size: 86592
    dim_model: 4096
    dim_ff: 10240
    num_layers: 48
    num_heads: 32
    dim_head: 128
    dropout_p: 0.0
    position_bias_num_buckets: 256
    position_bias_num_segment_buckets: 256
    position_bias_max_distance: 2048
    eps: 1.e-6
    half: True
  arch:
    type: CPMForPreTraining

trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'cpm_10b'
# if True, do evaluate during the training process. if false, do nothing.
# note that the task trainer should support _evaluate_in_training function.
do_eval: False
eval_step_interval: -1 # num of step intervals between each eval, -1 means no step-end eval.
eval_epoch_interval: 1 # num of epoch intervals between each eval, 1 means eval at every epoch end.

metric:
  type: ADGENMetric
  tokenizer_type: "cpm_10b" # use ChatGLMTokenizer

processor:
  return_tensors: ms
  tokenizer:
    type: CPMBeeTokenizer
  type: CPMProcessor

# ==== dataset config ====
train_dataset: &train_dataset
  data_loader:
    type: MindDataset
    dataset_dir: "/home/m30024275/cpm_mindrecord"
    shuffle: True
  input_columns: [ "inputs", "inputs_sub", "length", "context", "sample_ids", "num_segments", "segment",
                   "segment_rel_offset", "segment_rel", "spans", "ext_table_ids", "ext_table_sub", "label" ]
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 1
  repeat: 1
  numa_enable: False
  prefetch_size: 1
  seed: 0

train_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *train_dataset

eval_dataset: &eval_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ""
    shuffle: True
  input_columns: [ "inputs", "inputs_sub", "length", "context", "sample_ids", "num_segments", "segment_ids",
                   "segment_rel_offset", "segment_rel", "spans", "ext_ids", "ext_sub", "target" ]
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 1
  repeat: 1
  numa_enable: False
  prefetch_size: 1
  seed: 0

eval_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *eval_dataset

# ==== runner config ====
runner_config:
  epochs: 1
  batch_size: 1
  sink_mode: False
  sink_size: -1

runner_wrapper:
  type: ScaleTrainOneStepCell
  scale_sense:
    type: DynamicLossScaleUpdateCell
    loss_scale_value: 32768
    scale_factor: 2
    scale_window: 1000
  use_clip_grad: True

# lr schedule
lr_schedule:
  type: noam
  learning_rate: 1.e-4
  warmup_iter: 1
  end_iter: 2000

# optimizer
optimizer:
  type: AdamWeightDecayWithScale
  weight_decay: 0.01
  param_group: False

# parallel config
use_parallel: False
parallel:
  parallel_mode: 2 # 0-dataset, 1-semi, 2-auto, 3-hybrid
  gradients_mean: False
  loss_repeated_mean: True
  enable_alltoall: False
  full_batch: True
  search_mode: "sharding_propagation"
  enable_parallel_optimizer: True # optimizer shard
  strategy_ckpt_save_file: "./ckpt_strategy.ckpt"
parallel_config:
  data_parallel: 8
  model_parallel: 1
  pipeline_stage: 1
  expert_parallel: 1
  optimizer_shard: True # optimizer shard
  micro_batch_num: 1
  vocab_emb_dp: True
  gradient_aggregation_group: 8
micro_batch_interleave_num: 1

# moe
moe_config:
  expert_num: 1
  capacity_factor: 1.05
  aux_loss_factor: 0.05
  num_experts_chosen: 1

# recompute
recompute_config:
  recompute: False
  parallel_optimizer_comm_recompute: False
  mp_comm_recompute: True
  recompute_slice_activation: False

# autotune
auto_tune: False
filepath_prefix: './autotune'
autotune_per_step: 10

# profile
profile: False
profile_start_step: 1
profile_stop_step: 10
init_start_profile: True
profile_communication: True
profile_memory: True

# callbacks
callbacks:
  - type: MFLossMonitor
  - type: SummaryMonitor
    keep_default_action: True
  - type: CheckpointMointor
    prefix: "cpm-2b"
    save_checkpoint_steps: 500
    keep_checkpoint_max: 2
    integrated_save: False
    async_save: False
  - type: ObsMonitor
    keep_last: False
eval_callbacks:
  - type: ObsMonitor
    keep_last: False
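For reference, a hedged sketch of how in-training evaluation would be switched on in this config; the interval values are illustrative assumptions, and eval_dataset's empty dataset_dir above would need a real path first:

do_eval: True            # evaluate during training (the task trainer must support _evaluate_in_training)
eval_step_interval: 500  # assumed: evaluate every 500 steps; -1 disables step-end eval
eval_epoch_interval: -1  # assumed: disables epoch-end eval when stepping by interval instead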