|
@@ -1,6 +1,7 @@
|
|
|
import torch
|
|
import torch
|
|
|
from peft import PeftModel
|
|
from peft import PeftModel
|
|
|
import transformers
|
|
import transformers
|
|
|
|
|
+import gradio as gr
|
|
|
|
|
|
|
|
assert (
|
|
assert (
|
|
|
"LlamaTokenizer" in transformers._import_structure["models.llama"]
|
|
"LlamaTokenizer" in transformers._import_structure["models.llama"]
|
|
@@ -43,28 +44,62 @@ def generate_prompt(instruction, input=None):
|
|
|
model.eval()
|
|
model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
def evaluate(
    instruction,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    input=None,
    **kwargs,
):
    """Generate a response for *instruction* (optionally with *input* context).

    Builds an Alpaca-style prompt via ``generate_prompt``, runs generation on
    the globally loaded ``model``/``tokenizer`` pair, and returns the text
    that follows the ``"### Response:"`` marker in the decoded output.

    Args:
        instruction: Task description shown to the model.
        temperature: Sampling temperature forwarded to ``GenerationConfig``.
        top_p: Nucleus-sampling cutoff forwarded to ``GenerationConfig``.
        top_k: Top-k cutoff forwarded to ``GenerationConfig``.
        num_beams: Number of beams for beam search.
        input: Optional extra context for the prompt.  NOTE(review): this
            parameter shadows the ``input`` builtin, but the name is kept
            for backward compatibility with existing callers.
        **kwargs: Additional ``GenerationConfig`` fields.

    Returns:
        The model's response as a stripped string.

    Raises:
        IndexError: If the decoded output contains no "### Response:" marker.
    """
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    # assumes a CUDA device is available -- TODO confirm for CPU-only hosts
    input_ids = inputs["input_ids"].cuda()
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    # Inference only: no_grad avoids building an autograd graph and saves
    # memory during generation.
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=2048,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    # The prompt template ends with "### Response:"; everything after that
    # marker is the model's answer.
    return output.split("### Response:")[1].strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Launch a Gradio web UI over `evaluate`.  `share=True` additionally exposes
# a public tunnel URL beyond the local server.
gr.Interface(
    fn=evaluate,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Instruction", placeholder="Tell me about alpacas."
        ),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=0, maximum=4, step=1, value=4, label="Beams"),
    ],
    outputs=[
        # Fixed: the original used gr.inputs.Textbox (the deprecated *input*
        # namespace) for an output component; gr.components is the correct
        # namespace and matches the inputs list above.
        gr.components.Textbox(
            lines=5,
            label="Output",
        )
    ],
    title="🦙🌲 Alpaca-LoRA",
    description="Alpaca-LoRA is a 7B-parameter LLaMA model finetuned to follow instructions. It is trained on the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset and makes use of the Huggingface LLaMA implementation. For more information, please visit [the project's website](https://github.com/tloen/alpaca-lora).",
).launch(share=True)
|
|
|
|
|
+
|
|
|
|
|
+# Old testing code follows.
|
|
|
|
|
+
|
|
|
|
|
+"""
|
|
|
if __name__ == "__main__":
|
|
if __name__ == "__main__":
|
|
|
# testing code for readme
|
|
# testing code for readme
|
|
|
for instruction in [
|
|
for instruction in [
|
|
@@ -81,3 +116,4 @@ if __name__ == "__main__":
|
|
|
print("Instruction:", instruction)
|
|
print("Instruction:", instruction)
|
|
|
print("Response:", evaluate(instruction))
|
|
print("Response:", evaluate(instruction))
|
|
|
print()
|
|
print()
|
|
|
|
|
+"""
|