from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset, LoglikelihoodSingleTokenDataset
from lighteval.logging.hierarchical_logger import hlog, hlog_err, hlog_warn


if is_accelerate_available():
+    from accelerate import Accelerator
    from accelerate.utils import calculate_maximum_sizes, convert_bytes, get_max_memory

os.environ["TOKENIZERS_PARALLELISM"] = "false"

class BaseModel(LightevalModel):
    def __init__(
        self,
-        config: BaseModelConfig,
        env_config: EnvConfig,
+        config: BaseModelConfig,
    ):
        """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation."""
        self._config = config.init_configs(env_config)
@@ -114,6 +116,72 @@ def __init__(

        self.pairwise_tokenization = config.pairwise_tokenization

+    @classmethod
+    def from_model(
+        cls,
+        model: Union[AutoModelForCausalLM, LightevalModel],
+        env_config: EnvConfig,
+        accelerator: "Accelerator" = None,
+        tokenizer_name: str = None,  # custom tokenizer
+        trust_remote_code: bool = False,
+        use_chat_template: bool = False,
+        add_special_tokens: bool = True,
+        pairwise_tokenization: bool = False,
+        multichoice_continuations_start_space: bool = None,
+    ):
+        # Slightly hackish way to test whether the model is an AutoModelForCausalLM, since
+        # such instances don't derive from that class explicitly
+        assert isinstance(model, LightevalModel) or type(model).__name__ in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()
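+        # e.g. AutoModelForCausalLM.from_pretrained("gpt2") returns a GPT2LMHeadModel, and
+        # "GPT2LMHeadModel" appears in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()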
+
+        if isinstance(model, LightevalModel):
+            return model
+
+        # Instantiate the object without using __init__
+        self = cls.__new__(cls)
+        self._config = model.config
+        self._max_length = self._init_max_length(max_length=model.config.max_length)
+        self._tokenizer = self._create_auto_tokenizer_with_name(
+            model_name=model.name_or_path,
+            revision=model.config._commit_hash,
+            env_config=env_config,
+            trust_remote_code=trust_remote_code,
+            tokenizer_name=tokenizer_name,
+        )
+        self.model_name = _simplify_name(model.name_or_path)
+        self.model_sha = model.config._commit_hash
+
+        # The model is already loaded, so we only switch it to eval mode and disable gradients
+        self.model = model
+        self.model.eval()
+        torch.set_grad_enabled(False)
+
+        self.accelerator = accelerator
+        if accelerator is not None:
+            self._device = accelerator.device
+            self.model = self.accelerator.prepare(self.model.to(accelerator.device))
+        else:
+            self._device = "cpu"
+
+        self.use_chat_template = use_chat_template
+        self._add_special_tokens = add_special_tokens if add_special_tokens is not None else False
+        self.pairwise_tokenization = pairwise_tokenization
+        self.multichoice_continuations_start_space = multichoice_continuations_start_space
+
+        self.precision = _get_dtype(model.dtype, config=self._config)
+
+        if is_accelerate_available():
+            model_size, _ = calculate_maximum_sizes(self.model)
+            model_size = convert_bytes(model_size)
+        else:
+            model_size = -1
+        self.model_info = ModelInfo(
+            model_name=self.model_name,
+            model_sha=self.model_sha,
+            model_dtype=self.precision,
+            model_size=model_size,
+        )
+        return self
+
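+    # A minimal usage sketch for `from_model` (illustrative only; the checkpoint name and
+    # the EnvConfig defaults are assumptions, not part of this change):
+    #
+    #     model = AutoModelForCausalLM.from_pretrained("gpt2")
+    #     lighteval_model = BaseModel.from_model(model, env_config=EnvConfig())
+    #
+    # Passing an object that is already a LightevalModel is a no-op: it is returned as is.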
    @property
    def tokenizer(self):
        return self._tokenizer
@@ -207,10 +275,23 @@ def _create_auto_model(self, config: BaseModelConfig, env_config: EnvConfig) ->
    def _create_auto_tokenizer(
        self, config: BaseModelConfig, env_config: EnvConfig
    ) -> transformers.PreTrainedTokenizer:
-        return self._create_auto_tokenizer_with_name(config.pretrained, config=config, env_config=env_config)
+        return self._create_auto_tokenizer_with_name(
+            model_name=config.pretrained,
+            revision=config.revision,
+            env_config=env_config,
+            tokenizer_name=config.tokenizer,
+            subfolder=config.subfolder,
+            trust_remote_code=config.trust_remote_code,
+        )

    def _create_auto_tokenizer_with_name(
-        self, model_name: str, config: BaseModelConfig, env_config: EnvConfig
+        self,
+        model_name: str,
+        revision: str,
+        env_config: EnvConfig,
+        tokenizer_name: str = None,
+        subfolder: str = None,
+        trust_remote_code: bool = False,
    ) -> transformers.PreTrainedTokenizer:
        """
        Create a Hugging Face AutoTokenizer for the language model.
@@ -231,25 +312,35 @@ def _create_auto_tokenizer_with_name(
        """
        try:
            tokenizer = AutoTokenizer.from_pretrained(
-                model_name if config.tokenizer is None else config.tokenizer,
-                revision=config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""),
+                model_name if tokenizer_name is None else tokenizer_name,
+                revision=revision + (f"/{subfolder}" if subfolder is not None else ""),
                cache_dir=env_config.cache_dir,
                token=env_config.token,
-                trust_remote_code=config.trust_remote_code,
+                trust_remote_code=trust_remote_code,
                padding_side="left",
                truncation_side="left",
            )
        except RecursionError:
            tokenizer = AutoTokenizer.from_pretrained(
-                model_name if config.tokenizer is None else config.tokenizer,
-                revision=config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""),
+                model_name if tokenizer_name is None else tokenizer_name,
+                revision=revision + (f"/{subfolder}" if subfolder is not None else ""),
                cache_dir=env_config.cache_dir,
                token=env_config.token,
-                trust_remote_code=config.trust_remote_code,
+                trust_remote_code=trust_remote_code,
                unk_token="<unk>",
                padding_side="left",
                truncation_side="left",
            )
+        except FileNotFoundError:
+            hlog_warn("Problem when loading the tokenizer from the cache - discarding the provided cache path value.")
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_name if tokenizer_name is None else tokenizer_name,
+                revision=revision + (f"/{subfolder}" if subfolder is not None else ""),
+                token=env_config.token,
+                trust_remote_code=trust_remote_code,
+                padding_side="left",
+                truncation_side="left",
+            )
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.model_max_length = self.max_length
        hlog("Tokenizer truncation and padding sides set to the left.")
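        # Left padding keeps each prompt flush against the generation boundary. Illustrative
        # (hypothetical prompts): tokenizer(["hi", "a much longer prompt"], padding=True)
        # prepends pad tokens to the shorter sequence, so input_ids[:, -1] is always the
        # last real prompt token for every row of the batch.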