Parcourir la source

Revert "fix <eos> tokenization"

This reverts commit 6b69ea866575770f37998ebced7cff22418d41dc.
Eric Wang il y a 3 ans
Parent
commit
024dde7dab
1 fichier modifié avec 3 ajouts et 3 suppressions
  1. 3 3
      finetune.py

+ 3 - 3
finetune.py

@@ -17,7 +17,7 @@ GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
 EPOCHS = 3  # we don't need 3 tbh
 LEARNING_RATE = 3e-4  # the Karpathy constant
 CUTOFF_LEN = 256  # 256 accounts for about 96% of the data
-LORA_R = 16
+LORA_R = 8
 LORA_ALPHA = 16
 LORA_DROPOUT = 0.05
 
@@ -27,7 +27,7 @@ model = LLaMAForCausalLM.from_pretrained(
     device_map="auto",
 )
 tokenizer = LLaMATokenizer.from_pretrained(
-    "decapoda-research/llama-7b-hf", add_eos_token=False
+    "decapoda-research/llama-7b-hf", add_eos_token=True
 )
 
 model = prepare_model_for_int8_training(model)
@@ -70,7 +70,7 @@ def generate_prompt(data_point):
 
 data = data.shuffle().map(
     lambda data_point: tokenizer(
-        generate_prompt(data_point) + tokenizer.eos_token,
+        generate_prompt(data_point),
         truncation=True,
         max_length=CUTOFF_LEN,
         padding="max_length",