Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion RecommenderSystems/dlrm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
def get_args(print_args=True):
def int_list(x):
return list(map(int, x.split(",")))

def str_list(x):
return list(map(str, x.split(",")))
parser = argparse.ArgumentParser()

parser.add_argument("--bottom_mlp", type=int_list, default="512,256,128")
Expand Down Expand Up @@ -81,6 +82,17 @@ def int_list(x):
"--embedding_type", type=str, default="OneEmbedding", help="OneEmbedding or Embedding"
)
parser.add_argument("--embedding_split_axis", type=int, default=-1, help="-1: no split")
parser.add_argument("--column_size_array", type=int_list, help="column_size_array")
parser.add_argument(
"--blocked_based_path", type=str, default="", help="path for kv store"
)
parser.add_argument(
"--cache_policy", type=str_list, default="lru,none"
)
parser.add_argument("--cache_memory_budget_mb", type=int_list, default="16384,16384", help="cache_memory_budget_mb")
parser.add_argument(
"--value_memory_kind", type=str_list, default="device,host"
)
parser.add_argument(
"--test_name", type=str, default="noname_test"
)
Expand Down
72 changes: 59 additions & 13 deletions RecommenderSystems/dlrm/models/dlrm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@
from oneflow.framework.tensor import _xor
import oneflow.nn as nn
from typing import Any

import numpy as np

import os

__all__ = ["make_dlrm_module"]

Expand Down Expand Up @@ -133,27 +132,74 @@ def forward(self, ids):
return embeddings.to_consistent(sbp=flow.sbp.split(0), grad_sbp=flow.sbp.split(2))
else:
return embeddings

class OneEmbedding(nn.OneEmbeddingLookup):
    """Sparse embedding table backed by OneFlow's OneEmbedding lookup service.

    Builds the OneEmbedding ``options`` dict (per-column uniform initializers,
    multi-level cache, persistent kv-store path, and an SGD optimizer with
    polynomial decay + linear warmup) from command-line ``args``.

    NOTE(review): this block appears to be a flattened diff (old and new lines
    interleaved without +/- markers); the duplicated dict keys and the twin
    slots/column_id code paths below are flagged where they occur — confirm
    against the merged file before relying on this text.
    """

    def __init__(self, vocab_size, embed_size, args):
        # Per-column initializer range: uniform in [-sqrt(1/n), +sqrt(1/n)]
        # where n is that column's cardinality from --column_size_array.
        assert args.column_size_array is not None
        scales = np.sqrt(1 / np.array(args.column_size_array))
        initializer_list = []
        for i in range(scales.size):
            initializer_list.append(
                {"initializer": {"type": "uniform", "low": -scales[i], "high": scales[i],}}
            )
        # Up to two cache levels (e.g. "lru,none" -> device LRU cache only);
        # a policy of "none" disables that level. The three per-level lists
        # (policy, budget, memory kind) must be parallel.
        cache_list = []
        assert len(args.cache_policy) <= 2
        assert len(args.cache_policy) == len(args.cache_memory_budget_mb)
        assert len(args.cache_policy) == len(args.value_memory_kind)
        for i in range(len(args.cache_policy)):
            if args.cache_policy[i] != "none":
                cache = {
                    "policy": args.cache_policy[i],
                    "cache_memory_budget_mb": args.cache_memory_budget_mb[i],
                    "value_memory_kind": args.value_memory_kind[i]
                }
                cache_list.append(cache)
        print("cache_list", cache_list)
        options = {
            # NOTE(review): the next five entries look like the PRE-change keys
            # retained by diff flattening ("name" recurs below; "embedding_size"
            # is superseded by "embedding_dim"). Python keeps the LAST duplicate
            # key, so the later entries win at runtime — confirm against the
            # merged file.
            "name": "my_embedding",
            # Can't change the embedding_size 128 because the kv store value_length has been set to 128
            "embedding_size": embed_size,
            "dtype": flow.float,
            "embedding_options": '{"embedding_size": embed_size, "embedding_name":"EmbeddingTest"}',
            "name": "my_embedding",
            "embedding_dim": embed_size,
            "cache" : cache_list,
            "kv_store": {
                # On-disk persistent table (path from --blocked_based_path).
                "persistent_table": {
                    "path": args.blocked_based_path,
                    "physical_block_size": 512,
                },
            },
            # Fallback initializer; the per-column list above overrides it.
            "default_initializer": {"type": "normal", "mean": 0, "std": 1},
            "columns": initializer_list,
            # Embedding-table optimizer: SGD, base_lr 24, polynomial decay to 0
            # over 27772 batches with 2750 warmup batches (mirrors the DLRM
            # training schedule passed to train.py).
            "optimizer": {
                "lr": {
                    "base_lr": 24,
                    "decay": {
                        "type": "polynomial",
                        "decay_batches": 27772,
                        "end_lr": 0.0,
                        "power": 2.0,
                        "cycle": False,
                    },
                    "warmup": {
                        "type": "linear",
                        "warmup_batches": 2750,
                        "start_multiplier": 0.0,
                    },
                },
                "type": "sgd",
                "momentum": 0.0,
                # betas/eps are Adam-style knobs; unused by sgd — presumably
                # harmless leftovers, verify against the OneEmbedding API.
                "betas": [0.9, 0.999],
                "eps": 1e-8,
            },
        }
        super(OneEmbedding, self).__init__(options)
        # Row of column indices 0..25, shape (1, 26) — presumably one id per
        # Criteo sparse field; confirm against the dataloader.
        # NOTE(review): "slots" looks like the pre-rename buffer and
        # "column_id" its replacement; both registrations survive the diff
        # flattening.
        slots = flow.tensor(range(26), dtype=flow.int32).reshape(1,26)
        self.register_buffer("slots", slots)
        column_id = flow.tensor(range(26), dtype=flow.int32).reshape(1,26)
        self.register_buffer("column_id", column_id)

    def forward(self, ids):
        """Look up embeddings for `ids`, broadcasting the per-column id row to
        the batch size and matching the input's sbp/placement when consistent."""
        bsz = ids.shape[0]
        slots = flow.ones((bsz, 1), dtype=flow.int32, sbp=ids.sbp, placement=ids.placement) * self.slots
        column_id = flow.ones((bsz, 1), dtype=flow.int32, sbp=ids.sbp, placement=ids.placement) * self.column_id
        if (ids.is_consistent):
            slots = slots.to_consistent(sbp=ids.sbp, placement=ids.placement)
            # NOTE(review): this return (slots-based) appears to be the
            # pre-change line left in by diff flattening; the column_id path
            # below is the replacement — confirm against the merged file.
            return super(OneEmbedding, self._origin).forward(ids, slots)

            column_id = column_id.to_consistent(sbp=ids.sbp, placement=ids.placement)
        return super(OneEmbedding, self._origin).forward(ids, column_id)

    def set_model_parallel(self, placement=None):
        # No-op: OneEmbedding manages its own sharding, so the model-parallel
        # hook used by the plain Embedding path does nothing here.
        pass

Expand Down
51 changes: 51 additions & 0 deletions RecommenderSystems/dlrm/train_one_embedding_graph.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash
# Launch single-node, 4-GPU DLRM training with the OneEmbedding backend
# (graph execution mode) on the Criteo 1TB OFRecord dataset, and tee the
# log to train_one_embedding_graph_<N>gpu.log.

# Remove stale core dumps from previous crashed runs; -f keeps this quiet
# when there are none.
rm -f core.*

DEVICE_NUM_PER_NODE=4
MASTER_ADDR=127.0.0.1
NUM_NODES=1
NODE_RANK=0
DATA_DIR=/data/xiexuan/criteo1t_ofrecord_shuffle_per_day/
EMBD_SIZE=33762578
BATCHSIZE=55296

# Silence glog INFO/WARNING output from OneFlow.
export GLOG_minloglevel=2
# Raise the open-file limit: the persistent kv store and 5888 data parts
# hold many file descriptors at once.
ulimit -SHn 131072

eval_batch_size=32744
eval_batchs=$(( 3274330 / eval_batch_size ))
#eval_batchs=$(( 90243072 / eval_batch_size ))

export LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4:
# Directory holding the OneEmbedding persistent kv-store tables.
export BLOCK_BASED_PATH="rocks"
# Fix: expand the variable so the path being listed is actually shown
# (the original printed the literal text "ll BLOCK_BASED_PATH").
echo "ll $BLOCK_BASED_PATH"
ls -l $BLOCK_BASED_PATH
# Wipe each per-rank table (<rank>-<world_size>) for a fresh run.
rm -rf "$BLOCK_BASED_PATH"/0-4/*
rm -rf "$BLOCK_BASED_PATH"/1-4/*
rm -rf "$BLOCK_BASED_PATH"/2-4/*
rm -rf "$BLOCK_BASED_PATH"/3-4/*

#/usr/local/cuda-11.4/bin/nsys profile --stat=true \
#numactl --interleave=all \
python3 -m oneflow.distributed.launch \
    --nproc_per_node $DEVICE_NUM_PER_NODE \
    --nnodes $NUM_NODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    train.py \
    --interaction_type dot \
    --embedding_type OneEmbedding \
    --learning_rate 24 \
    --batch_size $BATCHSIZE \
    --data_dir $DATA_DIR \
    --loss_print_every_n_iter 100 \
    --eval_interval 1000 \
    --eval_batchs $eval_batchs \
    --eval_batch_size $eval_batch_size \
    --max_iter 75868 \
    --vocab_size $EMBD_SIZE \
    --data_part_num 5888 \
    --data_part_name_suffix_length 5 \
    --execution_mode 'graph' \
    --cache_policy 'lru,none' \
    --cache_memory_budget_mb '16384,163840' \
    --value_memory_kind 'device,host' \
    --blocked_based_path $BLOCK_BASED_PATH \
    --column_size_array '227605432,39060,17295,7424,20265,3,7122,1543,63,130229467,3067956,405282,10,2209,11938,155,4,976,14,292775614,40790948,187188510,590152,12973,108,36' \
    --test_name 'train_one_embedding_graph_'$DEVICE_NUM_PER_NODE'gpu' | tee 'train_one_embedding_graph_'$DEVICE_NUM_PER_NODE'gpu'.log