diff --git a/hft.py b/hft.py
index a0c3288..606933a 100644
--- a/hft.py
+++ b/hft.py
@@ -110,16 +110,14 @@ def custom_collate_fn(batch):
     input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch])
     attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])
     labels = torch.stack([torch.tensor(b["labels"]) for b in batch])
     source_idx = torch.tensor([b.get("source_idx", -1) for b in batch], dtype=torch.long)
-    #print("source_idx shape:", source_idx.shape)  # debugging
     return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "source_idx": source_idx}
 
 
-# Modified CustomModel class
-class CustomModel(AutoModelForCausalLM):  # 🔵 inheritance changed
+class CustomModel(nn.Module):
     def __init__(self, model_name, config):
-        super().__init__(config)  # 🔵 initialize the base class
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
+        super().__init__()
+        self.base_model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
         self.source_embedding = nn.Embedding(
             num_embeddings=1000,
             embedding_dim=config.hidden_size,
@@ -130,21 +128,24 @@ class CustomModel(AutoModelForCausalLM):  # 🔵 inheritance changed
         if source_idx is not None:
             source_idx = torch.clamp(source_idx, 0, self.source_embedding.num_embeddings - 1)
             source_embeds = self.source_embedding(source_idx).unsqueeze(1).expand(-1, input_ids.size(1), -1)
-            inputs_embeds = self.model.get_input_embeddings()(input_ids) + source_embeds
-            return self.model(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs)
-        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
-
-    # 🔵 added generate method
+            inputs_embeds = self.base_model.get_input_embeddings()(input_ids) + source_embeds
+            outputs = self.base_model(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs)
+        else:
+            outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
+        return outputs
+
     def generate(self, *args, **kwargs):
-        return self.model.generate(*args, **kwargs)
+        return self.base_model.generate(*args, **kwargs)
 
 class CustomTrainer(Trainer):
     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
         labels = inputs.pop("labels")
         source_idx = inputs.pop("source_idx", None)
-        outputs = model(**inputs, labels=labels, source_idx=source_idx)
-        loss = outputs.loss
-        return (loss, outputs) if return_outputs else loss
+        outputs = model(input_ids=inputs["input_ids"],
+                        attention_mask=inputs["attention_mask"],
+                        labels=labels,
+                        source_idx=source_idx)
+        return (outputs.loss, outputs) if return_outputs else outputs.loss
 
 # Initialize components
 source_mapper = SourceMapper()
@@ -160,9 +161,9 @@
 tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=8)
 # Initialize the model
 config = AutoModelForCausalLM.from_pretrained(model_name).config
-#print("Vocabulary size:", config.vocab_size)
 model = CustomModel(model_name, config)
-#model.to("cpu")  # switched to CPU for debugging
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)
 
 # Training configuration
 training_args = TrainingArguments(
@@ -171,13 +172,12 @@
     per_device_train_batch_size=2,
     gradient_accumulation_steps=4,
     learning_rate=2e-5,
-    fp16=False,  # disabled for CPU
+    fp16=torch.cuda.is_available(),
     logging_steps=1,
-    logging_dir="./logs",
     save_strategy="steps",
     save_steps=1000,
     logging_strategy="no",
-    report_to="none",
+    report_to="none"
 )
 
 # Training
@@ -189,91 +189,9 @@
 )
 trainer.train()
 
-# Create the directory for saving the model
-save_directory = "./trained_model/ably.do/hse"
-os.makedirs(save_directory, exist_ok=True)
-
-# 1. Save the model weights
-torch.save(model.state_dict(), os.path.join(save_directory, "hse-nano-mistral.bin"))
-
-# 2. Save the tokenizer
-tokenizer.save_pretrained(save_directory)
-
-# 3. Save the source mapping
-source_mapper_data = {
-    "source_to_idx": dict(source_mapper.source_to_idx),
-    "idx_to_source": source_mapper.idx_to_source
-}
-
-with open(os.path.join(save_directory, "source_mapper.json"), 'w') as f:
-    json.dump(source_mapper_data, f)
-
-# 4. Save the model config (optional but recommended)
-model.base_model.config.save_pretrained(save_directory)
-
-# Answer-generating function
+# Test function
 def generate_answer_with_source(question, model, tokenizer, source_mapper, max_length=200):
     device = next(model.parameters()).device
 
-    inputs = tokenizer(
-        question,
-        return_tensors="pt",
-        truncation=True,
-        max_length=512
-    ).to(device)
-
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_length=max_length,
-            num_return_sequences=1,
-            return_dict_in_generate=True,
-            temperature=0.7,
-            top_p=0.9,
-        )
-
-    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
-
-    # Extract source information
-    article_matches = re.finditer(r'Art\.\s+\d+', answer)
-    sources = set()
-
-    for match in article_matches:
-        article_ref = match.group(0).strip()
-        for idx, source in source_mapper.idx_to_source.items():
-            if article_ref in source:
-                sources.add(source)
-                break
-
-    return {
-        "question": question,
-        "answer": answer,
-        "sources": list(sources) if sources else ["Opracowanie własne"],
-        "num_tokens": len(outputs.sequences[0])
-    }
-
-
-
-# Example tests
-test_cases = [
-    "Jaki jest wymiar urlopu wypoczynkowego?",
-    "Jakie są zasady bezpieczeństwa na budowie?",
-    "Wyjaśnij procedurę zwolnienia grupowego",
-    "Co reguluje ustawa o ochronie danych osobowych?",
-    "Jakie dokumenty są potrzebne do zawarcia umowy o pracę?"
-]
-
-print("\n\n🔴 🔴 🔴 ROZPOCZĘCIE TESTOWANIA MODELU 🔴 🔴 🔴")
-for case in test_cases:
-    result = generate_answer_with_source(case, model, tokenizer, source_mapper)
-    print(f"\n🔸 Pytanie: {result['question']}")
-    print(f"🔸 Odpowiedź ({result['num_tokens']} tokenów):")
-    print(result['answer'])
-    print(f"🔸 Źródła: {', '.join(result['sources'])}")
-    print("-"*80)
-
-# Answer-generating function
-def generate_answer(question, max_length=200):
-    model.eval()
     inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=512).to(device)
     with torch.no_grad():
@@ -281,12 +199,42 @@
             **inputs,
             max_length=max_length,
             num_return_sequences=1,
-            return_dict_in_generate=True
+            temperature=0.7,
+            top_p=0.9,
+            pad_token_id=tokenizer.eos_token_id
         )
-
-    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
-    return answer
-
-# Create the directory for saving the model
-save_directory = "./trained_model/ably.do/hse"
-os.makedirs(save_directory, exist_ok=True)
\ No newline at end of file
+    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # Look up sources
+    sources = set()
+    for idx in source_mapper.idx_to_source:
+        if source_mapper.idx_to_source[idx] in answer:
+            sources.add(source_mapper.idx_to_source[idx])
+
+    return {
+        "question": question,
+        "answer": answer,
+        "sources": list(sources) if sources else ["Opracowanie własne"]
+    }
+
+
+# Testing
+test_questions = [
+    "Jaki jest wymiar urlopu wypoczynkowego?",
+    "Jakie są zasady bezpieczeństwa na budowie?",
+    "Wyjaśnij procedurę zwolnienia grupowego"
+]
+
+print("\n=== TEST MODELU ===")
+for question in test_questions:
+    result = generate_answer_with_source(question, model, tokenizer, source_mapper)
+    print(f"\nPytanie: {result['question']}")
+    print(f"Odpowiedź: {result['answer']}")
+    print(f"Źródła: {', '.join(result['sources'])}")
+    print("="*80)
+
+# Save the model
+save_directory = "./trained_model"
+os.makedirs(save_directory, exist_ok=True)
+torch.save(model.state_dict(), os.path.join(save_directory, "model.bin"))
+tokenizer.save_pretrained(save_directory)
\ No newline at end of file
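
A possible follow-up, not part of the patch: a minimal sketch of how the artifacts written by the new save block (./trained_model/model.bin plus the tokenizer files) could be reloaded for inference. The helper name load_trained_model is hypothetical; it expects the CustomModel class and model_name from hft.py to be passed in, because the wrapper has to be rebuilt before load_state_dict can restore the fine-tuned weights. Note also that, unlike the removed code, the patched script no longer writes source_mapper.json, so the SourceMapper would have to be reconstructed separately before generate_answer_with_source can report sources.

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_trained_model(custom_model_cls, model_name, save_directory="./trained_model"):
    """Rebuild the CustomModel wrapper and load the weights saved by the patched script.

    `custom_model_cls` is the CustomModel class and `model_name` the base checkpoint
    identifier, both exactly as defined in hft.py (assumption: they are importable
    without re-running the training script).
    """
    # Recreate the same architecture that was trained, then restore its weights.
    config = AutoModelForCausalLM.from_pretrained(model_name).config
    model = custom_model_cls(model_name, config)
    state_dict = torch.load(os.path.join(save_directory, "model.bin"), map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()

    # The tokenizer was saved alongside the weights by tokenizer.save_pretrained().
    tokenizer = AutoTokenizer.from_pretrained(save_directory)
    return model, tokenizer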