mod
This commit is contained in:
parent
7b6dad7f2b
commit
0db71fc40d
34
hft.py
34
hft.py
|
|
@ -5,17 +5,17 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
|
||||||
from datasets import Dataset
|
from datasets import Dataset
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
import PyPDF2
|
||||||
|
import docx2txt
|
||||||
|
import pytesseract
|
||||||
|
from PIL import Image
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from huggingface_hub import login
|
from huggingface_hub import login
|
||||||
import PyPDF2 # Dodane
|
|
||||||
import docx2txt # Dodane
|
|
||||||
import pytesseract # Dodane
|
|
||||||
from PIL import Image # Dodane
|
|
||||||
|
|
||||||
# Konfiguracja
|
# Konfiguracja
|
||||||
os.environ['TORCH_USE_CUDA_DSA'] = '1'
|
os.environ['TORCH_USE_CUDA_DSA'] = '1'
|
||||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||||
login(token="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") # Zastąp swoim tokenem
|
login(token="TWÓJ_TOKEN_HF")
|
||||||
|
|
||||||
class SourceMapper:
|
class SourceMapper:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
@ -57,6 +57,8 @@ def extract_text_from_file(file_path):
|
||||||
return text
|
return text
|
||||||
elif ext in ['.doc', '.docx']:
|
elif ext in ['.doc', '.docx']:
|
||||||
return docx2txt.process(file_path)
|
return docx2txt.process(file_path)
|
||||||
|
elif ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:
|
||||||
|
return pytesseract.image_to_string(Image.open(file_path))
|
||||||
else:
|
else:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
@ -138,12 +140,23 @@ def main():
|
||||||
max_length=512,
|
max_length=512,
|
||||||
return_tensors="pt"
|
return_tensors="pt"
|
||||||
)
|
)
|
||||||
tokenized["labels"] = tokenized["input_ids"].clone()
|
return {
|
||||||
tokenized["source_idx"] = examples["source_idx"]
|
"input_ids": tokenized["input_ids"],
|
||||||
return tokenized
|
"attention_mask": tokenized["attention_mask"],
|
||||||
|
"labels": tokenized["input_ids"].clone(),
|
||||||
|
"source_idx": examples["source_idx"]
|
||||||
|
}
|
||||||
|
|
||||||
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=8)
|
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=8)
|
||||||
|
|
||||||
|
def custom_collate_fn(features):
|
||||||
|
return {
|
||||||
|
"input_ids": torch.stack([torch.tensor(f["input_ids"]) for f in features]),
|
||||||
|
"attention_mask": torch.stack([torch.tensor(f["attention_mask"]) for f in features]),
|
||||||
|
"labels": torch.stack([torch.tensor(f["labels"]) for f in features]),
|
||||||
|
"source_idx": torch.tensor([f["source_idx"] for f in features], dtype=torch.long)
|
||||||
|
}
|
||||||
|
|
||||||
# Model
|
# Model
|
||||||
config = AutoModelForCausalLM.from_pretrained(model_name).config
|
config = AutoModelForCausalLM.from_pretrained(model_name).config
|
||||||
model = CustomModel(model_name, config)
|
model = CustomModel(model_name, config)
|
||||||
|
|
@ -162,14 +175,15 @@ def main():
|
||||||
save_strategy="steps",
|
save_strategy="steps",
|
||||||
save_steps=1000,
|
save_steps=1000,
|
||||||
report_to="none",
|
report_to="none",
|
||||||
weight_decay=0.01
|
weight_decay=0.01,
|
||||||
|
remove_unused_columns=False
|
||||||
)
|
)
|
||||||
|
|
||||||
trainer = CustomTrainer(
|
trainer = CustomTrainer(
|
||||||
model=model,
|
model=model,
|
||||||
args=training_args,
|
args=training_args,
|
||||||
train_dataset=tokenized_dataset,
|
train_dataset=tokenized_dataset,
|
||||||
data_collator=lambda x: x
|
data_collator=custom_collate_fn
|
||||||
)
|
)
|
||||||
print("Rozpoczęcie treningu...")
|
print("Rozpoczęcie treningu...")
|
||||||
trainer.train()
|
trainer.train()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue