Files

230 lines
8.0 KiB
Python
Raw Permalink Normal View History

2026-03-14 22:57:45 +00:00
import os
import re
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from datetime import datetime
import pandas as pd
import pdfplumber
START_MARKER = "Planeamento e execução da formação Respostas:"
END_MARKER = "Ação dos formadores"
def clean_cell(x) -> str:
if x is None:
return ""
x = str(x).replace("\n", " ").replace("\r", " ").strip()
x = re.sub(r"\s+", " ", x)
return x
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
df = df.applymap(clean_cell)
df = df.loc[~(df.apply(lambda r: all(v == "" for v in r), axis=1))].reset_index(drop=True)
return df
def df_text(df: pd.DataFrame) -> str:
vals = df.astype(str).fillna("").values.flatten().tolist()
return " ".join(vals)
def safe_filename(s: str) -> str:
s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
s = re.sub(r"\s+", "_", s.strip())
return s[:180] if len(s) > 180 else s
def save_outputs(df: pd.DataFrame, pdf_path: str, year: str, course: str, out_dir: str):
os.makedirs(out_dir, exist_ok=True)
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base = f"{safe_filename(os.path.splitext(os.path.basename(pdf_path))[0])}__{safe_filename(course)}__{safe_filename(year)}__{stamp}"
csv_path = os.path.join(out_dir, base + ".csv")
xlsx_path = os.path.join(out_dir, base + ".xlsx")
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
df.to_excel(xlsx_path, index=False)
return csv_path, xlsx_path
def normalize_marker(s: str) -> str:
s = s.replace("", "-")
s = re.sub(r"\s+", " ", s).strip().lower()
return s
def find_target_table(pdf_path: str):
start_n = normalize_marker(START_MARKER)
end_n = normalize_marker(END_MARKER)
with pdfplumber.open(pdf_path) as pdf:
start_page_idx = None
for i, page in enumerate(pdf.pages):
text = page.extract_text() or ""
tnorm = normalize_marker(text)
if start_n in tnorm:
start_page_idx = i
break
if start_page_idx is None:
raise ValueError(f"Não encontrei o início do bloco: '{START_MARKER}'")
end_page_idx = None
for j in range(start_page_idx, len(pdf.pages)):
text = pdf.pages[j].extract_text() or ""
tnorm = normalize_marker(text)
if end_n in tnorm:
end_page_idx = j
break
if end_page_idx is None:
raise ValueError(f"Encontrei o início, mas não encontrei o fim do bloco: '{END_MARKER}'")
candidate_tables = []
for pno in range(start_page_idx, end_page_idx + 1):
page = pdf.pages[pno]
tables = page.extract_tables() or []
page_text = normalize_marker(page.extract_text() or "")
for ti, tbl in enumerate(tables, start=1):
if not tbl or len(tbl) < 2:
continue
df = pd.DataFrame(tbl[1:], columns=tbl[0])
df = normalize_df(df)
raw = df_text(df)
raw_n = normalize_marker(raw)
score = 0
r, c = df.shape
if c >= 3: score += 10
if r >= 4: score += 10
if pno == start_page_idx:
score += 15
for kw in ["respostas", "planeamento", "execução", "formação", "formadores"]:
if kw in raw_n:
score += 2
empties = (df == "").sum().sum()
total = r * c
if total and empties / total > 0.6:
score -= 10
candidate_tables.append((score, pno, ti, df))
if not candidate_tables:
raise ValueError("Encontrei o bloco, mas não encontrei tabelas nesse intervalo.")
candidate_tables.sort(key=lambda x: x[0], reverse=True)
best_score, best_pno, best_ti, best_df = candidate_tables[0]
return {
"start_page": start_page_idx + 1,
"end_page": end_page_idx + 1,
"best_page": best_pno + 1,
"best_table_id": f"p{best_pno+1}_t{best_ti}",
"score": best_score,
"df": best_df
}
class App(tk.Tk):
def __init__(self):
super().__init__()
self.title("Extrair Tabela do PDF (bloco entre títulos)")
self.geometry("820x300")
self.resizable(False, False)
self.pdf_path = tk.StringVar(value="")
self.year_var = tk.StringVar(value="")
self.course_var = tk.StringVar(value="")
self.outdir_var = tk.StringVar(value="output_tables")
self._build_ui()
def _build_ui(self):
frm = ttk.Frame(self, padding=14)
frm.pack(fill="both", expand=True)
# PDF
ttk.Label(frm, text="PDF:").grid(row=0, column=0, sticky="w")
ttk.Entry(frm, textvariable=self.pdf_path, width=74).grid(row=0, column=1, sticky="w", padx=(8, 8))
ttk.Button(frm, text="Escolher...", command=self.on_browse_pdf).grid(row=0, column=2, sticky="w")
# Ano
ttk.Label(frm, text="Ano:").grid(row=1, column=0, sticky="w", pady=(12, 0))
ttk.Entry(frm, textvariable=self.year_var, width=20).grid(row=1, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
# Curso
ttk.Label(frm, text="Curso:").grid(row=2, column=0, sticky="w", pady=(12, 0))
ttk.Entry(frm, textvariable=self.course_var, width=50).grid(row=2, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
# Output dir
ttk.Label(frm, text="Pasta de saída:").grid(row=3, column=0, sticky="w", pady=(12, 0))
ttk.Entry(frm, textvariable=self.outdir_var, width=40).grid(row=3, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
# info sobre markers
info = (
f"Vai extrair a tabela do bloco:\n"
f"INÍCIO: {START_MARKER}\n"
f"FIM: {END_MARKER}"
)
ttk.Label(frm, text=info).grid(row=4, column=0, columnspan=3, sticky="w", pady=(14, 0))
# Botões
btns = ttk.Frame(frm)
btns.grid(row=5, column=0, columnspan=3, sticky="e", pady=(18, 0))
ttk.Button(btns, text="Extrair", command=self.on_extract).pack(side="right", padx=(8, 0))
ttk.Button(btns, text="Sair", command=self.destroy).pack(side="right")
def on_browse_pdf(self):
path = filedialog.askopenfilename(
title="Seleciona um PDF",
filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")]
)
if path:
self.pdf_path.set(path)
def on_extract(self):
pdf_path = self.pdf_path.get().strip()
year = self.year_var.get().strip()
course = self.course_var.get().strip()
outdir = self.outdir_var.get().strip() or "output_tables"
if not pdf_path or not os.path.exists(pdf_path):
messagebox.showerror("Erro", "Seleciona um PDF válido.")
return
if not year:
messagebox.showerror("Erro", "Preenche o Ano.")
return
if not course:
messagebox.showerror("Erro", "Preenche o Curso.")
return
try:
self.configure(cursor="watch")
self.update_idletasks()
result = find_target_table(pdf_path)
df = result["df"]
csv_path, xlsx_path = save_outputs(df, pdf_path, year, course, outdir)
messagebox.showinfo(
"OK",
"Tabela extraída com sucesso!\n\n"
f"Bloco encontrado nas páginas: {result['start_page']}{result['end_page']}\n"
f"Tabela escolhida: {result['best_table_id']} (score={result['score']})\n"
f"Dimensões: {df.shape[0]} x {df.shape[1]}\n\n"
f"CSV: {csv_path}\n"
f"XLSX: {xlsx_path}"
)
except Exception as e:
messagebox.showerror("Erro", f"Falhou.\n\nDetalhes: {e}")
finally:
self.configure(cursor="")
self.update_idletasks()
if __name__ == "__main__":
App().mainloop()