diff --git a/herbert.py b/herbert.py
index f15f81d..d313d32 100644
--- a/herbert.py
+++ b/herbert.py
@@ -6,21 +6,28 @@ import faiss
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from datasets import Dataset
-from peft import LoraConfig, get_peft_model, PeftModel
-from transformers import (AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer,
-                          DataCollatorForLanguageModeling, LlamaTokenizer, LlamaForCausalLM)
-import bitsandbytes as bnb
+from peft import LoraConfig, get_peft_model
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 
 # 1️⃣ Initialize the embedding model
 embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 
-# 2️⃣ Load documents and embeddings
+# 2️⃣ Add documents and embeddings
 def read_documents_from_file(file_path):
     with open(file_path, 'r', encoding='utf-8') as file:
         content = file.read()
     articles = content.split('\n\n')
-    return [article.strip() for article in articles if article.strip().startswith('Art.')]
-
+    documents = []
+    for article in articles:
+        if article.strip().startswith('Art.'):
+            documents.append(article.strip())
+    return documents
+#documents = [
+#    "How do I set up a company in Poland?",
+#    "How do I settle VAT?",
+#    "The complaint procedure in an online store.",
+#    "What documents are needed to register a business?"
+#]
 file_path = './docs/kodekspracy.txt'  # Change to the correct path
 documents = read_documents_from_file(file_path)
 embeddings = embed_model.encode(documents)
@@ -32,31 +39,41 @@ index.add(np.array(embeddings, dtype=np.float32))
 
 # 4️⃣ Prepare the training data
 def create_training_data():
-    return Dataset.from_dict({"text": documents, "embedding": embeddings.tolist()})
+    data = {
+        "text": documents,
+        "embedding": embeddings.tolist()
+    }
+    return Dataset.from_dict(data)
 
 dataset = create_training_data()
+
+# Split the data into training and evaluation sets
 split_dataset = dataset.train_test_split(test_size=0.25)
-train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]
+train_dataset = split_dataset["train"]
+eval_dataset = split_dataset["test"]
 
-# 5️⃣ Load the base and fine-tuned models
-base_model = "decapoda-research/llama-7b-hf"
-finetuned_model = "mmosiolek/polpaca-lora-7b"
-
-tokenizer = LlamaTokenizer.from_pretrained(base_model)
-tokenizer.pad_token_id = 0
-tokenizer.padding_side = "left"
-
-model = LlamaForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16).to("cuda")
-model = PeftModel.from_pretrained(model, finetuned_model).to("cuda")
+# 5️⃣ Load the Vicuna 7B PL base model
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_name = "Lajonbot/vicuna-7b-v1.5-PL-lora_unload"
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 # 6️⃣ LoRA configuration
 lora_config = LoraConfig(
-    r=8, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM")
+    r=8, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
+)
 model = get_peft_model(model, lora_config)
 
-# 7️⃣ Tokenization
+# 7️⃣ Tokenize the data
+max_length = 384
+
 def tokenize_function(examples):
-    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=384)
+    return tokenizer(
+        examples["text"],
+        padding="max_length",
+        truncation=True,
+        max_length=max_length
+    )
 
 tokenized_train = train_dataset.map(tokenize_function, batched=True)
 tokenized_eval = eval_dataset.map(tokenize_function, batched=True)
@@ -64,35 +81,39 @@ tokenized_eval = eval_dataset.map(tokenize_function, batched=True)
 # 8️⃣ Training parameters
 training_args = TrainingArguments(
     output_dir="./results",
-    evaluation_strategy="steps",
-    eval_steps=500,
-    save_strategy="steps",
-    save_steps=500,
+    eval_strategy="steps",  # Evaluate every fixed number of steps
+    eval_steps=500,  # Evaluate every 500 steps
+    save_strategy="steps",  # Save a checkpoint every fixed number of steps
+    save_steps=500,  # Save a checkpoint every 500 steps
     learning_rate=1e-5,
     per_device_train_batch_size=2,
     per_device_eval_batch_size=2,
     num_train_epochs=16,
     weight_decay=0.01,
-    load_best_model_at_end=True,
-    metric_for_best_model="loss",
-    greater_is_better=False,
+    load_best_model_at_end=True,  # Reload the best checkpoint at the end
+    metric_for_best_model="loss",  # Criterion for picking the best model
+    greater_is_better=False,  # Lower loss = better model
 )
 
 
 # 9️⃣ Data Collator
-data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+data_collator = DataCollatorForLanguageModeling(
+    tokenizer=tokenizer,
+    mlm=False
+)
 
-# 🔟 Training
+# 🔟 Train the model
 trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_train,
-    eval_dataset=tokenized_eval,
+    eval_dataset=tokenized_eval,  # Added evaluation set
     data_collator=data_collator,
 )
+
 trainer.train()
 
-# 1️⃣1️⃣ Save the model locally
-model.save_pretrained("./models/finetuned_llama")
-tokenizer.save_pretrained("./models/finetuned_llama")
+# 1️⃣1️⃣ Save the model
+model.save_pretrained("./models/herbert")
+tokenizer.save_pretrained("./models/herbert")
 print("✅ Model trained and saved!")
\ No newline at end of file
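
Not part of the patch: a minimal sketch of how the adapter saved above in ./models/herbert might be loaded back for inference, assuming the same Lajonbot/vicuna-7b-v1.5-PL-lora_unload base model; the prompt string is purely illustrative.

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base model, then attach the LoRA adapter the training script saved
base = AutoModelForCausalLM.from_pretrained(
    "Lajonbot/vicuna-7b-v1.5-PL-lora_unload", torch_dtype=torch.float16
).to(device)
model = PeftModel.from_pretrained(base, "./models/herbert").to(device)
tokenizer = AutoTokenizer.from_pretrained("./models/herbert")

prompt = "Art. 1."  # hypothetical prompt, matching the 'Art.' prefix of the training articles
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))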