Files

243 lines
7.0 KiB
Python
Raw Permalink Normal View History

2026-03-14 22:57:45 +00:00
import os
import re
import sys
from datetime import datetime
import tkinter as tk
from tkinter import filedialog, simpledialog, messagebox
import pandas as pd
import pdfplumber
START_MARKER = "Planeamento e execução da formação "
END_MARKER = "Ação dos formadores"
def norm_text(s: str) -> str:
if not s:
return ""
s = s.replace("–", "-")
s = re.sub(r"\s+", " ", s).strip().lower()
return s
def clean_cell(x) -> str:
if x is None:
return ""
x = str(x).replace("\n", " ").replace("\r", " ").strip()
x = re.sub(r"\s+", " ", x)
return x
def convert(x):
if isinstance(x, (int, float)):
return str(x).replace(".", ",")
if isinstance(x, str):
import re
if re.fullmatch(r"-?\d+\.\d+", x):
return x.replace(".", ",")
return x
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
df = df.apply(lambda col: col.map(clean_cell))
df = df.apply(lambda col: col.map(convert))
df = df.loc[
~(df.apply(lambda r: all(v == "" for v in r), axis=1))
].reset_index(drop=True)
return df
def df_flattext(df: pd.DataFrame) -> str:
vals = df.astype(str).fillna("").values.flatten().tolist()
return " ".join(vals)
def safe_filename(s: str) -> str:
s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
s = re.sub(r"\s+", "_", s.strip())
return s[:180]
def dot_to_comma(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
def convert(x):
if isinstance(x, (int, float)):
return str(x).replace(".", ",")
if isinstance(x, str):
if re.fullmatch(r"-?\d+\.\d+", x):
return x.replace(".", ",")
return x
return df.apply(lambda col: col.map(convert))
def save_row_if_new(
final_df: pd.DataFrame,
xlsx_path: str,
row_name: str,
sheet_name: str = "Resultados",
):
root = tk.Tk()
root.withdraw()
if not os.path.exists(xlsx_path):
final_df.to_excel(xlsx_path, sheet_name=sheet_name, index=True)
messagebox.showinfo("OK", f"Ficheiro criado e linha guardada:\n{row_name}\n\n{xlsx_path}")
return False
try:
existing = pd.read_excel(xlsx_path, sheet_name=sheet_name, index_col=0)
except ValueError:
final_df.to_excel(xlsx_path, sheet_name=sheet_name, index=True)
messagebox.showinfo("OK", f"Sheet '{sheet_name}' criada e linha guardada:\n{row_name}")
return False
if row_name in existing.index.astype(str):
messagebox.showwarning(
"Já existe",
f"Já existe uma linha com este nome:\n{row_name}\n\nNada foi gravado.",
)
return True
updated = pd.concat([existing, final_df], axis=0)
with pd.ExcelWriter(xlsx_path, engine="openpyxl", mode="w") as writer:
updated.to_excel(writer, sheet_name=sheet_name, index=True)
messagebox.showinfo("OK", f"Linha adicionada com sucesso:\n{row_name}\n\n{xlsx_path}")
return False
def main() -> int:
root = tk.Tk()
root.withdraw()
root.update()
pdf_path = filedialog.askopenfilename(
title="Seleciona o PDF",
filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")],
)
if not pdf_path:
messagebox.showerror("Erro", "Nenhum PDF selecionado.")
return 1
pdf_path = os.path.abspath(pdf_path)
course_name = simpledialog.askstring(
title="Nome do Curso",
prompt="Introduz o nome do curso:",
)
if not course_name:
messagebox.showerror("Erro", "Nome do curso nÃo foi preenchido.")
return 1
year = simpledialog.askstring(
title="Ano",
prompt="Introduz o ano (ex: 2024):",
)
if not year or not year.isdigit():
messagebox.showerror("Erro", "Ano inválido.")
return 1
start_n = norm_text(START_MARKER)
end_n = norm_text(END_MARKER)
with pdfplumber.open(pdf_path) as pdf:
start_page = None
end_page = None
for i, page in enumerate(pdf.pages):
txt = norm_text(page.extract_text() or "")
if start_n in txt:
start_page = i
break
if start_page is None:
raise ValueError(f"Não encontrei o ini­cio: {START_MARKER}")
for j in range(start_page, len(pdf.pages)):
txt = norm_text(pdf.pages[j].extract_text() or "")
if end_n in txt:
end_page = j
break
if end_page is None:
raise ValueError(f"Encontrei o inicio, mas não encontrei o fim: {END_MARKER}")
candidates = []
with pdfplumber.open(pdf_path) as pdf:
for pno in range(start_page, end_page + 1):
page = pdf.pages[pno]
tables = page.extract_tables() or []
for ti, tbl in enumerate(tables, start=1):
if not tbl or len(tbl) < 2:
continue
header = tbl[0]
body = tbl[1:]
df = pd.DataFrame(body, columns=header)
df = normalize_df(df)
r, c = df.shape
if r < 2 or c < 2:
continue
text_n = norm_text(df_flattext(df))
score = 0
if c >= 3:
score += 10
if r >= 4:
score += 10
if pno == start_page:
score += 15
for kw in ["planeamento", "execucao", "formacao", "respostas", "formadores"]:
if kw in text_n:
score += 2
empties = (df == "").sum().sum()
total = r * c
if total and empties / total > 0.6:
score -= 10
candidates.append((score, pno, ti, df))
if not candidates:
raise ValueError("Encontrei o bloco, mas não encontrei tabelas nas páginas desse intervalo.")
candidates.sort(key=lambda x: x[0], reverse=True)
best_score, best_pno, best_ti, best_df = candidates[0]
print(f"Bloco: páginas {start_page+1} até {end_page+1}")
print(f"Tabela escolhida: p{best_pno+1}_t{best_ti} (score={best_score})")
print("Dimensões:", best_df.shape)
rows_config = {
1: [0, 2],
2: [0, 2],
3: [0, 2],
5: [0, 2],
7: [0, 2],
8: [0, 2],
9: [0, 2],
10: [0, 2],
12: [0, 3],
13: [0, 3],
14: [0, 3],
15: [0, 3],
16: [0, 3],
17: [0, 3],
}
new_rows = []
for row_idx, col_idxs in rows_config.items():
values = best_df.iloc[row_idx, col_idxs].tolist()
new_rows.append(values)
new_df = pd.DataFrame(new_rows)
data_dict = dict(zip(new_df[0], new_df[1]))
final_df = pd.DataFrame([data_dict])
final_df.index = [f"{year} - {course_name}"]
final_df = dot_to_comma(final_df)
row_name = f"{year} - {course_name}"
xlsx_path = "BDpowerbi.xlsx"
save_row_if_new(final_df, xlsx_path, row_name, sheet_name="Resultados")
return 0
if __name__ == "__main__":
sys.exit(main())