@@ -1,13 +1,15 @@
## 🦙🌲🤏 Alpaca-LoRA: Low-Rank LLaMA Instruct-Tuning
-**The code in this repo is not yet fully tested. I'm still in the process of retraining the model with the outputs included, and I make no guarantees about the results of running `generate.py`.**
-
-This repository contains code for reproducing the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) results using [low-rank adaptations (LoRAs)](https://arxiv.org/pdf/2106.09685.pdf).
-The goal is to provide an open Instruct model of similar quality to `text-davinci-003` that can run on most consumer GPUs with 8-bit quantization.
-
-Users will need to be ready to fork Huggingface `transformers` to access Jason Phang's [LLaMA implementation](https://github.com/huggingface/transformers/pull/21955).
+This repository contains code for reproducing the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) results using [low-rank adaptation (LoRA)](https://arxiv.org/pdf/2106.09685.pdf).
+The fine-tuning runs within five hours on a consumer GPU,
+and the LoRA weights are made available on the Huggingface model hub.
+With Huggingface's out-of-the-box 8-bit quantization,
+we aim to provide an Instruct model of similar quality to `text-davinci-003` that can run [on a Raspberry Pi](https://twitter.com/miolini/status/1634982361757790209). (For research.)
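The heavy lifting here is done by Huggingface's bitsandbytes integration, but the basic idea behind 8-bit weight storage can be sketched in a few lines of numpy. This is an illustrative absmax (symmetric) int8 round-trip only, not the `LLM.int8()` algorithm the library actually uses:

```python
import numpy as np

def quantize_int8(w: np.ndarray):
    """Absmax int8 quantization: scale weights into the [-127, 127] range."""
    scale = np.abs(w).max() / 127.0
    q = np.round(w / scale).astype(np.int8)
    return q, scale

def dequantize(q: np.ndarray, scale: float) -> np.ndarray:
    return q.astype(np.float32) * scale

w = np.random.default_rng(0).standard_normal((4, 4)).astype(np.float32)
q, scale = quantize_int8(w)
w_hat = dequantize(q, scale)

# int8 storage is 4x smaller than float32; round-to-nearest bounds the
# per-weight reconstruction error by half the quantization step.
assert np.max(np.abs(w - w_hat)) <= scale / 2 + 1e-6
```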
+
+Until Jason Phang's [LLaMA implementation](https://github.com/huggingface/transformers/pull/21955)
+is merged, users will need to replace their local Huggingface `transformers` as described below.
For fine-tuning LoRAs we use Huggingface's [PEFT](https://github.com/huggingface/peft).
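PEFT handles the adapter injection for real models; as a rough numpy sketch of the mechanism itself (sizes and scaling below are illustrative, not this repo's hyperparameters):

```python
import numpy as np

rng = np.random.default_rng(0)
d, r, alpha = 16, 4, 8                   # hidden size, LoRA rank, scaling (illustrative)

W = rng.standard_normal((d, d))          # frozen pretrained weight
A = rng.standard_normal((r, d)) * 0.01   # trainable down-projection
B = np.zeros((d, r))                     # trainable up-projection, zero-initialized

def forward(x):
    # Base path plus low-rank update; only A and B would receive gradients.
    return W @ x + (alpha / r) * (B @ (A @ x))

x = rng.standard_normal(d)
# With B zero-initialized, the adapted layer starts out identical to the base layer.
assert np.allclose(forward(x), W @ x)
```

Only the `r * d + d * r` adapter parameters are trained, which is what makes fine-tuning a 7B model feasible on a single consumer GPU.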
-Included also is code to download the LLaMA foundation model from the Huggingface model hub (for research).
+Included also is code to download the LLaMA foundation model from the Huggingface model hub. (For research.)
Once I've finished running the finetuning code myself, I'll put the LoRA on the Hub as well, and the code in `generate.py` should work as expected.
### Setup
@@ -36,7 +38,9 @@ PRs adapting this code to multi-GPU setups and larger models are always welcome.
### To do
-- [ ] Hyperparameter tuning
+- [ ] Merge LoRA weights into LLaMA weights to remove inference dependency on PEFT
+- [ ] Train/val/test split
+- [ ] Hyperparameter tuning code
- [ ] Documentation for notebook
- [ ] Support for `13b`, `30b`, `65b`
- [ ] Train a version that doesn't waste tokens on the prompt header
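On the first to-do item above: "merging" means folding the low-rank update into the base matrix, so inference needs neither PEFT nor any extra matmuls. A minimal numpy sketch of the algebra (illustrative sizes):

```python
import numpy as np

rng = np.random.default_rng(0)
d, r, alpha = 16, 4, 8            # hidden size, LoRA rank, scaling (illustrative)

W = rng.standard_normal((d, d))   # base weight
A = rng.standard_normal((r, d))   # trained LoRA down-projection
B = rng.standard_normal((d, r))   # trained LoRA up-projection

# Fold the adapter into the base matrix: W' = W + (alpha / r) * B @ A
W_merged = W + (alpha / r) * (B @ A)

x = rng.standard_normal(d)
# The merged layer matches base-plus-adapter exactly.
assert np.allclose(W_merged @ x, W @ x + (alpha / r) * (B @ (A @ x)))
```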