浏览代码

tokenizer changes

Eric Wang 3 年之前
父节点
当前提交
41e0ff6c78
共有 1 个文件被更改,包括 15 次插入,15 次删除
  1. 15 15
      finetune.py

+ 15 - 15
finetune.py

@@ -16,7 +16,9 @@ model = LLaMAForCausalLM.from_pretrained(
 )
 
 
-tokenizer = LLaMATokenizer.from_pretrained("decapoda-research/llama-7b-hf")
+tokenizer = LLaMATokenizer.from_pretrained(
+    "decapoda-research/llama-7b-hf", add_eos_token=True
+)
 
 model = prepare_model_for_int8_training(model)
 
@@ -29,10 +31,7 @@ config = LoraConfig(
     task_type="CAUSAL_LM",
 )
 model = get_peft_model(model, config)
-
-tokenizer.pad_token = tokenizer.eos_token
-tokenizer.pad_token_id = tokenizer.eos_token_id
-
+tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token
 data = load_dataset("json", data_files="alpaca_data.json")
 
 
@@ -47,25 +46,27 @@ def generate_prompt(data_point):
 ### Input:
 {data_point["input"]}
 
-### Response:"""
+### Response:
+{data_point["output"]}"""
     else:
         return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
 ### Instruction:
 {data_point["instruction"]}
 
-### Response:"""
+### Response:
+{data_point["output"]}"""
 
 
-# optimized for RTX 4090.
-MICRO_BATCH_SIZE = 12
-BATCH_SIZE = 36
+# optimized for RTX 4090. for larger GPUs, increase some of these?
+MICRO_BATCH_SIZE = 4  # this could actually be 5 but i like powers of 2
+BATCH_SIZE = 128
 GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
-EPOCHS = 3
-LEARNING_RATE = 2e-5
-CUTOFF_LEN = 128
+EPOCHS = 3  # from the result
+LEARNING_RATE = 3e-4  # the karpathy constant
+CUTOFF_LEN = 256  # 256 accounts for about 96% of the data
 
-data = data.map(
+data = data.shuffle().map(
     lambda data_point: tokenizer(
         generate_prompt(data_point),
         truncation=True,
@@ -74,7 +75,6 @@ data = data.map(
     )
 )
 
-
 trainer = transformers.Trainer(
     model=model,
     train_dataset=data["train"],