# finetune.py (~4.0 KB) — LoRA fine-tuning of LLaMA-7B on Alpaca-style data
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers

# Guard: the LLaMA classes only exist in sufficiently new transformers builds.
# NOTE(review): this reads the private `transformers._import_structure` attribute
# and uses `assert` (stripped under `python -O`) — fragile, left as-is here.
assert (
    "LlamaTokenizer" in transformers._import_structure["models.llama"]
), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"

from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import (
    prepare_model_for_int8_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
)
# Training hyperparameters.
# optimized for RTX 4090. for larger GPUs, increase some of these?
MICRO_BATCH_SIZE = 4  # this could actually be 5 but i like powers of 2
BATCH_SIZE = 128  # effective batch size, reached via gradient accumulation below
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 3  # we don't need 3 tbh
LEARNING_RATE = 3e-4  # the Karpathy constant
CUTOFF_LEN = 256  # 256 accounts for about 96% of the data
LORA_R = 8  # LoRA rank (adapter bottleneck width)
LORA_ALPHA = 16  # LoRA scaling factor
LORA_DROPOUT = 0.05
VAL_SET_SIZE = 2000  # examples held out for evaluation (see train_test_split below)
# Load the base LLaMA-7B weights quantized to 8-bit, sharded across available
# devices automatically.
model = LlamaForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
)
tokenizer = LlamaTokenizer.from_pretrained(
    "decapoda-research/llama-7b-hf", add_eos_token=True
)

# PEFT helper that prepares the 8-bit model for training (see peft docs).
model = prepare_model_for_int8_training(model)

# Inject LoRA adapters only into the attention query/value projections.
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token

# Expects a local alpaca_data.json of instruction/input/output records.
# Fixed seed keeps the train/validation split reproducible across runs.
data = load_dataset("json", data_files="alpaca_data.json")
train_val = data["train"].train_test_split(
    test_size=VAL_SET_SIZE, shuffle=True, seed=42
)
train_data = train_val["train"]
val_data = train_val["test"]
  54. def generate_prompt(data_point):
  55. # sorry about the formatting disaster gotta move fast
  56. if data_point["input"]:
  57. return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
  58. ### Instruction:
  59. {data_point["instruction"]}
  60. ### Input:
  61. {data_point["input"]}
  62. ### Response:
  63. {data_point["output"]}"""
  64. else:
  65. return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
  66. ### Instruction:
  67. {data_point["instruction"]}
  68. ### Response:
  69. {data_point["output"]}"""
  70. def tokenize(prompt):
  71. # there's probably a way to do this with the tokenizer settings
  72. # but again, gotta move fast
  73. result = tokenizer(
  74. prompt,
  75. truncation=True,
  76. max_length=CUTOFF_LEN + 1,
  77. padding="max_length",
  78. )
  79. return {
  80. "input_ids": result["input_ids"][:-1],
  81. "attention_mask": result["attention_mask"][:-1],
  82. }
# Tokenize both splits: each record becomes its rendered prompt's token ids.
train_data = train_data.shuffle().map(lambda x: tokenize(generate_prompt(x)))
val_data = val_data.shuffle().map(lambda x: tokenize(generate_prompt(x)))

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=100,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=20,
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=200,  # eval and save cadences match so best-model tracking works
        save_steps=200,
        output_dir="lora-alpaca",
        save_total_limit=3,
        load_best_model_at_end=True,
    ),
    # mlm=False: causal-LM collation (labels are the inputs, no token masking).
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# NOTE(review): presumably disabled to avoid cache use during training — confirm.
model.config.use_cache = False

# Monkey-patch model.state_dict so saved checkpoints contain only the LoRA
# adapter weights (via get_peft_model_state_dict) instead of the full model.
old_state_dict = model.state_dict
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
).__get__(model, type(model))

trainer.train()
model.save_pretrained("lora-alpaca")

print("\n If there's a warning about missing keys above, please disregard :)")