Commit 20d637e

fix: ExLlama Backend Context Size & Rope Scaling (#1311)
* fix: context_size not propagated to exllama backend
* fix: exllama rope scaling
1 parent 480b14c · commit 20d637e

File tree

1 file changed: +13 −0 lines changed

backend/python/exllama/exllama.py

Lines changed: 13 additions & 0 deletions
@@ -63,6 +63,19 @@ def LoadModel(self, request, context):
 
         config = ExLlamaConfig(model_config_path)  # create config from config.json
         config.model_path = model_path             # supply path to model weights file
+        if (request.ContextSize):
+            config.max_seq_len = request.ContextSize             # override max sequence length
+            config.max_attention_size = request.ContextSize**2   # Should be set to context_size^2.
+            # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
+
+        # Set Rope scaling.
+        if (request.RopeFreqScale):
+            # Alpha value for Rope scaling.
+            # Higher value increases context but adds perplexity.
+            # alpha_value and compress_pos_emb are mutually exclusive.
+            # https://github.com/turboderp/exllama/issues/115
+            config.alpha_value = request.RopeFreqScale
+            config.calculate_rotary_embedding_base()
 
         model = ExLlama(config)                       # create ExLlama instance and load the weights
         tokenizer = ExLlamaTokenizer(tokenizer_path)  # create tokenizer from tokenizer model file
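
For orientation, here is a minimal standalone sketch of what the patched LoadModel logic amounts to when driven directly from Python. The imports assume the upstream turboderp/exllama module layout (classes in model.py and tokenizer.py); the file paths and the two request-derived values are hypothetical placeholders, not taken from this commit.

# Minimal sketch (not part of the commit): loading an ExLlama model with an
# overridden context size and a RoPE alpha value, mirroring the diff above.
# Assumes the turboderp/exllama module layout; paths and values are placeholders.
from model import ExLlama, ExLlamaCache, ExLlamaConfig
from tokenizer import ExLlamaTokenizer

model_config_path = "/models/llama-7b/config.json"     # hypothetical path
model_path = "/models/llama-7b/model.safetensors"      # hypothetical path
tokenizer_path = "/models/llama-7b/tokenizer.model"    # hypothetical path

context_size = 8192        # stands in for request.ContextSize
rope_freq_scale = 2.0      # stands in for request.RopeFreqScale

config = ExLlamaConfig(model_config_path)   # create config from config.json
config.model_path = model_path              # supply path to model weights file

if context_size:
    config.max_seq_len = context_size               # override max sequence length
    config.max_attention_size = context_size ** 2   # attention buffer sized to context_size^2

if rope_freq_scale:
    # alpha_value (NTK-aware RoPE scaling) and compress_pos_emb are mutually exclusive.
    config.alpha_value = rope_freq_scale
    config.calculate_rotary_embedding_base()        # recompute the rotary base from alpha

model = ExLlama(config)                       # load the weights
tokenizer = ExLlamaTokenizer(tokenizer_path)  # load the tokenizer model
cache = ExLlamaCache(model)                   # KV cache allocated for max_seq_len tokens

As the diff's own comment notes, max_attention_size is kept at context_size^2 so the attention buffers match the enlarged window; raising max_seq_len alone is not enough for long prompts.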
