230 lines
8.0 KiB
Python
230 lines
8.0 KiB
Python
import os
|
||
import re
|
||
import tkinter as tk
|
||
from tkinter import ttk, filedialog, messagebox
|
||
from datetime import datetime
|
||
|
||
import pandas as pd
|
||
import pdfplumber
|
||
|
||
|
||
START_MARKER = "Planeamento e execução da formação – Respostas:"
|
||
END_MARKER = "Ação dos formadores"
|
||
|
||
|
||
def clean_cell(x) -> str:
|
||
if x is None:
|
||
return ""
|
||
x = str(x).replace("\n", " ").replace("\r", " ").strip()
|
||
x = re.sub(r"\s+", " ", x)
|
||
return x
|
||
|
||
|
||
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
|
||
df = df.copy()
|
||
df = df.applymap(clean_cell)
|
||
df = df.loc[~(df.apply(lambda r: all(v == "" for v in r), axis=1))].reset_index(drop=True)
|
||
return df
|
||
|
||
|
||
def df_text(df: pd.DataFrame) -> str:
|
||
vals = df.astype(str).fillna("").values.flatten().tolist()
|
||
return " ".join(vals)
|
||
|
||
|
||
def safe_filename(s: str) -> str:
|
||
s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
|
||
s = re.sub(r"\s+", "_", s.strip())
|
||
return s[:180] if len(s) > 180 else s
|
||
|
||
|
||
def save_outputs(df: pd.DataFrame, pdf_path: str, year: str, course: str, out_dir: str):
|
||
os.makedirs(out_dir, exist_ok=True)
|
||
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||
|
||
base = f"{safe_filename(os.path.splitext(os.path.basename(pdf_path))[0])}__{safe_filename(course)}__{safe_filename(year)}__{stamp}"
|
||
csv_path = os.path.join(out_dir, base + ".csv")
|
||
xlsx_path = os.path.join(out_dir, base + ".xlsx")
|
||
|
||
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
|
||
df.to_excel(xlsx_path, index=False)
|
||
return csv_path, xlsx_path
|
||
|
||
|
||
def normalize_marker(s: str) -> str:
|
||
s = s.replace("–", "-")
|
||
s = re.sub(r"\s+", " ", s).strip().lower()
|
||
return s
|
||
|
||
|
||
def find_target_table(pdf_path: str):
|
||
start_n = normalize_marker(START_MARKER)
|
||
end_n = normalize_marker(END_MARKER)
|
||
|
||
with pdfplumber.open(pdf_path) as pdf:
|
||
start_page_idx = None
|
||
for i, page in enumerate(pdf.pages):
|
||
text = page.extract_text() or ""
|
||
tnorm = normalize_marker(text)
|
||
if start_n in tnorm:
|
||
start_page_idx = i
|
||
break
|
||
if start_page_idx is None:
|
||
raise ValueError(f"Não encontrei o início do bloco: '{START_MARKER}'")
|
||
end_page_idx = None
|
||
for j in range(start_page_idx, len(pdf.pages)):
|
||
text = pdf.pages[j].extract_text() or ""
|
||
tnorm = normalize_marker(text)
|
||
if end_n in tnorm:
|
||
end_page_idx = j
|
||
break
|
||
|
||
if end_page_idx is None:
|
||
raise ValueError(f"Encontrei o início, mas não encontrei o fim do bloco: '{END_MARKER}'")
|
||
candidate_tables = []
|
||
for pno in range(start_page_idx, end_page_idx + 1):
|
||
page = pdf.pages[pno]
|
||
tables = page.extract_tables() or []
|
||
page_text = normalize_marker(page.extract_text() or "")
|
||
|
||
for ti, tbl in enumerate(tables, start=1):
|
||
if not tbl or len(tbl) < 2:
|
||
continue
|
||
df = pd.DataFrame(tbl[1:], columns=tbl[0])
|
||
df = normalize_df(df)
|
||
raw = df_text(df)
|
||
raw_n = normalize_marker(raw)
|
||
|
||
score = 0
|
||
r, c = df.shape
|
||
if c >= 3: score += 10
|
||
if r >= 4: score += 10
|
||
if pno == start_page_idx:
|
||
score += 15
|
||
for kw in ["respostas", "planeamento", "execução", "formação", "formadores"]:
|
||
if kw in raw_n:
|
||
score += 2
|
||
empties = (df == "").sum().sum()
|
||
total = r * c
|
||
if total and empties / total > 0.6:
|
||
score -= 10
|
||
|
||
candidate_tables.append((score, pno, ti, df))
|
||
|
||
if not candidate_tables:
|
||
raise ValueError("Encontrei o bloco, mas não encontrei tabelas nesse intervalo.")
|
||
candidate_tables.sort(key=lambda x: x[0], reverse=True)
|
||
best_score, best_pno, best_ti, best_df = candidate_tables[0]
|
||
|
||
return {
|
||
"start_page": start_page_idx + 1,
|
||
"end_page": end_page_idx + 1,
|
||
"best_page": best_pno + 1,
|
||
"best_table_id": f"p{best_pno+1}_t{best_ti}",
|
||
"score": best_score,
|
||
"df": best_df
|
||
}
|
||
|
||
|
||
class App(tk.Tk):
|
||
def __init__(self):
|
||
super().__init__()
|
||
self.title("Extrair Tabela do PDF (bloco entre títulos)")
|
||
self.geometry("820x300")
|
||
self.resizable(False, False)
|
||
|
||
self.pdf_path = tk.StringVar(value="")
|
||
self.year_var = tk.StringVar(value="")
|
||
self.course_var = tk.StringVar(value="")
|
||
self.outdir_var = tk.StringVar(value="output_tables")
|
||
|
||
self._build_ui()
|
||
|
||
def _build_ui(self):
|
||
frm = ttk.Frame(self, padding=14)
|
||
frm.pack(fill="both", expand=True)
|
||
|
||
# PDF
|
||
ttk.Label(frm, text="PDF:").grid(row=0, column=0, sticky="w")
|
||
ttk.Entry(frm, textvariable=self.pdf_path, width=74).grid(row=0, column=1, sticky="w", padx=(8, 8))
|
||
ttk.Button(frm, text="Escolher...", command=self.on_browse_pdf).grid(row=0, column=2, sticky="w")
|
||
|
||
# Ano
|
||
ttk.Label(frm, text="Ano:").grid(row=1, column=0, sticky="w", pady=(12, 0))
|
||
ttk.Entry(frm, textvariable=self.year_var, width=20).grid(row=1, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
|
||
|
||
# Curso
|
||
ttk.Label(frm, text="Curso:").grid(row=2, column=0, sticky="w", pady=(12, 0))
|
||
ttk.Entry(frm, textvariable=self.course_var, width=50).grid(row=2, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
|
||
|
||
# Output dir
|
||
ttk.Label(frm, text="Pasta de saída:").grid(row=3, column=0, sticky="w", pady=(12, 0))
|
||
ttk.Entry(frm, textvariable=self.outdir_var, width=40).grid(row=3, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
|
||
|
||
# info sobre markers
|
||
info = (
|
||
f"Vai extrair a tabela do bloco:\n"
|
||
f"INÍCIO: {START_MARKER}\n"
|
||
f"FIM: {END_MARKER}"
|
||
)
|
||
ttk.Label(frm, text=info).grid(row=4, column=0, columnspan=3, sticky="w", pady=(14, 0))
|
||
|
||
# Botões
|
||
btns = ttk.Frame(frm)
|
||
btns.grid(row=5, column=0, columnspan=3, sticky="e", pady=(18, 0))
|
||
ttk.Button(btns, text="Extrair", command=self.on_extract).pack(side="right", padx=(8, 0))
|
||
ttk.Button(btns, text="Sair", command=self.destroy).pack(side="right")
|
||
|
||
def on_browse_pdf(self):
|
||
path = filedialog.askopenfilename(
|
||
title="Seleciona um PDF",
|
||
filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")]
|
||
)
|
||
if path:
|
||
self.pdf_path.set(path)
|
||
|
||
def on_extract(self):
|
||
pdf_path = self.pdf_path.get().strip()
|
||
year = self.year_var.get().strip()
|
||
course = self.course_var.get().strip()
|
||
outdir = self.outdir_var.get().strip() or "output_tables"
|
||
|
||
if not pdf_path or not os.path.exists(pdf_path):
|
||
messagebox.showerror("Erro", "Seleciona um PDF válido.")
|
||
return
|
||
if not year:
|
||
messagebox.showerror("Erro", "Preenche o Ano.")
|
||
return
|
||
if not course:
|
||
messagebox.showerror("Erro", "Preenche o Curso.")
|
||
return
|
||
|
||
try:
|
||
self.configure(cursor="watch")
|
||
self.update_idletasks()
|
||
|
||
result = find_target_table(pdf_path)
|
||
df = result["df"]
|
||
|
||
csv_path, xlsx_path = save_outputs(df, pdf_path, year, course, outdir)
|
||
|
||
messagebox.showinfo(
|
||
"OK",
|
||
"Tabela extraída com sucesso!\n\n"
|
||
f"Bloco encontrado nas páginas: {result['start_page']} → {result['end_page']}\n"
|
||
f"Tabela escolhida: {result['best_table_id']} (score={result['score']})\n"
|
||
f"Dimensões: {df.shape[0]} x {df.shape[1]}\n\n"
|
||
f"CSV: {csv_path}\n"
|
||
f"XLSX: {xlsx_path}"
|
||
)
|
||
|
||
except Exception as e:
|
||
messagebox.showerror("Erro", f"Falhou.\n\nDetalhes: {e}")
|
||
finally:
|
||
self.configure(cursor="")
|
||
self.update_idletasks()
|
||
|
||
|
||
if __name__ == "__main__":
|
||
App().mainloop()
|