l.gabrysiak 2025-02-25 21:23:33 +01:00
parent 7b6dad7f2b
commit 0db71fc40d
1 changed file with 24 additions and 10 deletions

hft.py

@@ -5,17 +5,17 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
 from datasets import Dataset
 import re
 import json
-import PyPDF2
-import docx2txt
-import pytesseract
-from PIL import Image
 from collections import defaultdict
 from huggingface_hub import login
+import PyPDF2  # Added
+import docx2txt  # Added
+import pytesseract  # Added
+from PIL import Image  # Added
 
 # Configuration
 os.environ['TORCH_USE_CUDA_DSA'] = '1'
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-login(token="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")  # Replace with your own token
+login(token="YOUR_HF_TOKEN")
 
 class SourceMapper:
     def __init__(self):
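Note on the change above: the removed line had committed a live-looking Hugging Face token, which remains recoverable from git history even after this fix, so rotating it is the safe move. A minimal sketch of an alternative that keeps the secret out of the source entirely (not part of this commit; the HF_TOKEN variable name is an assumption):

    import os
    from huggingface_hub import login

    # Read the token from the environment, e.g. `export HF_TOKEN=hf_...` before running
    login(token=os.environ["HF_TOKEN"])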
@@ -57,6 +57,8 @@ def extract_text_from_file(file_path):
         return text
     elif ext in ['.doc', '.docx']:
         return docx2txt.process(file_path)
+    elif ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:
+        return pytesseract.image_to_string(Image.open(file_path))
     else:
         return ""
@@ -138,12 +140,23 @@ def main():
             max_length=512,
             return_tensors="pt"
         )
-        tokenized["labels"] = tokenized["input_ids"].clone()
-        tokenized["source_idx"] = examples["source_idx"]
-        return tokenized
+        return {
+            "input_ids": tokenized["input_ids"],
+            "attention_mask": tokenized["attention_mask"],
+            "labels": tokenized["input_ids"].clone(),
+            "source_idx": examples["source_idx"]
+        }
 
     tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=8)
 
+    def custom_collate_fn(features):
+        return {
+            "input_ids": torch.stack([torch.tensor(f["input_ids"]) for f in features]),
+            "attention_mask": torch.stack([torch.tensor(f["attention_mask"]) for f in features]),
+            "labels": torch.stack([torch.tensor(f["labels"]) for f in features]),
+            "source_idx": torch.tensor([f["source_idx"] for f in features], dtype=torch.long)
+        }
+
     # Model
     config = AutoModelForCausalLM.from_pretrained(model_name).config
     model = CustomModel(model_name, config)
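Context for this hunk: Dataset.map stores whatever the mapping function returns as plain Python lists (return_tensors="pt" does not survive the round-trip through Arrow), and Trainer hands its data_collator a list of per-example dicts while expecting one dict of batched tensors back. The old data_collator=lambda x: x therefore passed raw lists straight to the model, which is what custom_collate_fn fixes by rebuilding and stacking tensors. A quick sanity check, assuming the tokenizer pads to max_length=512 and torch is imported at the top of hft.py:

    batch = custom_collate_fn([tokenized_dataset[0], tokenized_dataset[1]])
    print(batch["input_ids"].shape)   # expected: torch.Size([2, 512])
    print(batch["source_idx"].shape)  # expected: torch.Size([2])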
@@ -162,14 +175,15 @@ def main():
         save_strategy="steps",
         save_steps=1000,
         report_to="none",
-        weight_decay=0.01
+        weight_decay=0.01,
+        remove_unused_columns=False
     )
 
     trainer = CustomTrainer(
         model=model,
         args=training_args,
         train_dataset=tokenized_dataset,
-        data_collator=lambda x: x
+        data_collator=custom_collate_fn
     )
 
     print("Starting training...")
     trainer.train()
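remove_unused_columns=False is what keeps source_idx alive end to end: by default Trainer drops every dataset column whose name is not a parameter of the model's forward(), so the extra index would be stripped before the collator ever saw it. CustomTrainer itself is not shown in this diff; a minimal sketch of the compute_loss it plausibly needs, assuming source_idx is consumed on the trainer side rather than passed to the model (the body below is an illustration, not the repository's actual code):

    from transformers import Trainer

    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            source_idx = inputs.pop("source_idx")  # assumed bookkeeping; not a forward() argument
            outputs = model(**inputs)  # causal-LM loss comes from the labels in the batch
            return (outputs.loss, outputs) if return_outputs else outputs.loss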