230 lines
8.0 KiB
Python
230 lines
8.0 KiB
Python
|
|
import os
|
|||
|
|
import re
|
|||
|
|
import tkinter as tk
|
|||
|
|
from tkinter import ttk, filedialog, messagebox
|
|||
|
|
from datetime import datetime
|
|||
|
|
|
|||
|
|
import pandas as pd
|
|||
|
|
import pdfplumber
|
|||
|
|
|
|||
|
|
|
|||
|
|
START_MARKER = "Planeamento e execução da formação – Respostas:"
|
|||
|
|
END_MARKER = "Ação dos formadores"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def clean_cell(x) -> str:
|
|||
|
|
if x is None:
|
|||
|
|
return ""
|
|||
|
|
x = str(x).replace("\n", " ").replace("\r", " ").strip()
|
|||
|
|
x = re.sub(r"\s+", " ", x)
|
|||
|
|
return x
|
|||
|
|
|
|||
|
|
|
|||
|
|
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
|
|||
|
|
df = df.copy()
|
|||
|
|
df = df.applymap(clean_cell)
|
|||
|
|
df = df.loc[~(df.apply(lambda r: all(v == "" for v in r), axis=1))].reset_index(drop=True)
|
|||
|
|
return df
|
|||
|
|
|
|||
|
|
|
|||
|
|
def df_text(df: pd.DataFrame) -> str:
|
|||
|
|
vals = df.astype(str).fillna("").values.flatten().tolist()
|
|||
|
|
return " ".join(vals)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def safe_filename(s: str) -> str:
|
|||
|
|
s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
|
|||
|
|
s = re.sub(r"\s+", "_", s.strip())
|
|||
|
|
return s[:180] if len(s) > 180 else s
|
|||
|
|
|
|||
|
|
|
|||
|
|
def save_outputs(df: pd.DataFrame, pdf_path: str, year: str, course: str, out_dir: str):
|
|||
|
|
os.makedirs(out_dir, exist_ok=True)
|
|||
|
|
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|||
|
|
|
|||
|
|
base = f"{safe_filename(os.path.splitext(os.path.basename(pdf_path))[0])}__{safe_filename(course)}__{safe_filename(year)}__{stamp}"
|
|||
|
|
csv_path = os.path.join(out_dir, base + ".csv")
|
|||
|
|
xlsx_path = os.path.join(out_dir, base + ".xlsx")
|
|||
|
|
|
|||
|
|
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
|
|||
|
|
df.to_excel(xlsx_path, index=False)
|
|||
|
|
return csv_path, xlsx_path
|
|||
|
|
|
|||
|
|
|
|||
|
|
def normalize_marker(s: str) -> str:
|
|||
|
|
s = s.replace("–", "-")
|
|||
|
|
s = re.sub(r"\s+", " ", s).strip().lower()
|
|||
|
|
return s
|
|||
|
|
|
|||
|
|
|
|||
|
|
def find_target_table(pdf_path: str):
|
|||
|
|
start_n = normalize_marker(START_MARKER)
|
|||
|
|
end_n = normalize_marker(END_MARKER)
|
|||
|
|
|
|||
|
|
with pdfplumber.open(pdf_path) as pdf:
|
|||
|
|
start_page_idx = None
|
|||
|
|
for i, page in enumerate(pdf.pages):
|
|||
|
|
text = page.extract_text() or ""
|
|||
|
|
tnorm = normalize_marker(text)
|
|||
|
|
if start_n in tnorm:
|
|||
|
|
start_page_idx = i
|
|||
|
|
break
|
|||
|
|
if start_page_idx is None:
|
|||
|
|
raise ValueError(f"Não encontrei o início do bloco: '{START_MARKER}'")
|
|||
|
|
end_page_idx = None
|
|||
|
|
for j in range(start_page_idx, len(pdf.pages)):
|
|||
|
|
text = pdf.pages[j].extract_text() or ""
|
|||
|
|
tnorm = normalize_marker(text)
|
|||
|
|
if end_n in tnorm:
|
|||
|
|
end_page_idx = j
|
|||
|
|
break
|
|||
|
|
|
|||
|
|
if end_page_idx is None:
|
|||
|
|
raise ValueError(f"Encontrei o início, mas não encontrei o fim do bloco: '{END_MARKER}'")
|
|||
|
|
candidate_tables = []
|
|||
|
|
for pno in range(start_page_idx, end_page_idx + 1):
|
|||
|
|
page = pdf.pages[pno]
|
|||
|
|
tables = page.extract_tables() or []
|
|||
|
|
page_text = normalize_marker(page.extract_text() or "")
|
|||
|
|
|
|||
|
|
for ti, tbl in enumerate(tables, start=1):
|
|||
|
|
if not tbl or len(tbl) < 2:
|
|||
|
|
continue
|
|||
|
|
df = pd.DataFrame(tbl[1:], columns=tbl[0])
|
|||
|
|
df = normalize_df(df)
|
|||
|
|
raw = df_text(df)
|
|||
|
|
raw_n = normalize_marker(raw)
|
|||
|
|
|
|||
|
|
score = 0
|
|||
|
|
r, c = df.shape
|
|||
|
|
if c >= 3: score += 10
|
|||
|
|
if r >= 4: score += 10
|
|||
|
|
if pno == start_page_idx:
|
|||
|
|
score += 15
|
|||
|
|
for kw in ["respostas", "planeamento", "execução", "formação", "formadores"]:
|
|||
|
|
if kw in raw_n:
|
|||
|
|
score += 2
|
|||
|
|
empties = (df == "").sum().sum()
|
|||
|
|
total = r * c
|
|||
|
|
if total and empties / total > 0.6:
|
|||
|
|
score -= 10
|
|||
|
|
|
|||
|
|
candidate_tables.append((score, pno, ti, df))
|
|||
|
|
|
|||
|
|
if not candidate_tables:
|
|||
|
|
raise ValueError("Encontrei o bloco, mas não encontrei tabelas nesse intervalo.")
|
|||
|
|
candidate_tables.sort(key=lambda x: x[0], reverse=True)
|
|||
|
|
best_score, best_pno, best_ti, best_df = candidate_tables[0]
|
|||
|
|
|
|||
|
|
return {
|
|||
|
|
"start_page": start_page_idx + 1,
|
|||
|
|
"end_page": end_page_idx + 1,
|
|||
|
|
"best_page": best_pno + 1,
|
|||
|
|
"best_table_id": f"p{best_pno+1}_t{best_ti}",
|
|||
|
|
"score": best_score,
|
|||
|
|
"df": best_df
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
class App(tk.Tk):
|
|||
|
|
def __init__(self):
|
|||
|
|
super().__init__()
|
|||
|
|
self.title("Extrair Tabela do PDF (bloco entre títulos)")
|
|||
|
|
self.geometry("820x300")
|
|||
|
|
self.resizable(False, False)
|
|||
|
|
|
|||
|
|
self.pdf_path = tk.StringVar(value="")
|
|||
|
|
self.year_var = tk.StringVar(value="")
|
|||
|
|
self.course_var = tk.StringVar(value="")
|
|||
|
|
self.outdir_var = tk.StringVar(value="output_tables")
|
|||
|
|
|
|||
|
|
self._build_ui()
|
|||
|
|
|
|||
|
|
def _build_ui(self):
|
|||
|
|
frm = ttk.Frame(self, padding=14)
|
|||
|
|
frm.pack(fill="both", expand=True)
|
|||
|
|
|
|||
|
|
# PDF
|
|||
|
|
ttk.Label(frm, text="PDF:").grid(row=0, column=0, sticky="w")
|
|||
|
|
ttk.Entry(frm, textvariable=self.pdf_path, width=74).grid(row=0, column=1, sticky="w", padx=(8, 8))
|
|||
|
|
ttk.Button(frm, text="Escolher...", command=self.on_browse_pdf).grid(row=0, column=2, sticky="w")
|
|||
|
|
|
|||
|
|
# Ano
|
|||
|
|
ttk.Label(frm, text="Ano:").grid(row=1, column=0, sticky="w", pady=(12, 0))
|
|||
|
|
ttk.Entry(frm, textvariable=self.year_var, width=20).grid(row=1, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
|
|||
|
|
|
|||
|
|
# Curso
|
|||
|
|
ttk.Label(frm, text="Curso:").grid(row=2, column=0, sticky="w", pady=(12, 0))
|
|||
|
|
ttk.Entry(frm, textvariable=self.course_var, width=50).grid(row=2, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
|
|||
|
|
|
|||
|
|
# Output dir
|
|||
|
|
ttk.Label(frm, text="Pasta de saída:").grid(row=3, column=0, sticky="w", pady=(12, 0))
|
|||
|
|
ttk.Entry(frm, textvariable=self.outdir_var, width=40).grid(row=3, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
|
|||
|
|
|
|||
|
|
# info sobre markers
|
|||
|
|
info = (
|
|||
|
|
f"Vai extrair a tabela do bloco:\n"
|
|||
|
|
f"INÍCIO: {START_MARKER}\n"
|
|||
|
|
f"FIM: {END_MARKER}"
|
|||
|
|
)
|
|||
|
|
ttk.Label(frm, text=info).grid(row=4, column=0, columnspan=3, sticky="w", pady=(14, 0))
|
|||
|
|
|
|||
|
|
# Botões
|
|||
|
|
btns = ttk.Frame(frm)
|
|||
|
|
btns.grid(row=5, column=0, columnspan=3, sticky="e", pady=(18, 0))
|
|||
|
|
ttk.Button(btns, text="Extrair", command=self.on_extract).pack(side="right", padx=(8, 0))
|
|||
|
|
ttk.Button(btns, text="Sair", command=self.destroy).pack(side="right")
|
|||
|
|
|
|||
|
|
def on_browse_pdf(self):
|
|||
|
|
path = filedialog.askopenfilename(
|
|||
|
|
title="Seleciona um PDF",
|
|||
|
|
filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")]
|
|||
|
|
)
|
|||
|
|
if path:
|
|||
|
|
self.pdf_path.set(path)
|
|||
|
|
|
|||
|
|
def on_extract(self):
|
|||
|
|
pdf_path = self.pdf_path.get().strip()
|
|||
|
|
year = self.year_var.get().strip()
|
|||
|
|
course = self.course_var.get().strip()
|
|||
|
|
outdir = self.outdir_var.get().strip() or "output_tables"
|
|||
|
|
|
|||
|
|
if not pdf_path or not os.path.exists(pdf_path):
|
|||
|
|
messagebox.showerror("Erro", "Seleciona um PDF válido.")
|
|||
|
|
return
|
|||
|
|
if not year:
|
|||
|
|
messagebox.showerror("Erro", "Preenche o Ano.")
|
|||
|
|
return
|
|||
|
|
if not course:
|
|||
|
|
messagebox.showerror("Erro", "Preenche o Curso.")
|
|||
|
|
return
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
self.configure(cursor="watch")
|
|||
|
|
self.update_idletasks()
|
|||
|
|
|
|||
|
|
result = find_target_table(pdf_path)
|
|||
|
|
df = result["df"]
|
|||
|
|
|
|||
|
|
csv_path, xlsx_path = save_outputs(df, pdf_path, year, course, outdir)
|
|||
|
|
|
|||
|
|
messagebox.showinfo(
|
|||
|
|
"OK",
|
|||
|
|
"Tabela extraída com sucesso!\n\n"
|
|||
|
|
f"Bloco encontrado nas páginas: {result['start_page']} → {result['end_page']}\n"
|
|||
|
|
f"Tabela escolhida: {result['best_table_id']} (score={result['score']})\n"
|
|||
|
|
f"Dimensões: {df.shape[0]} x {df.shape[1]}\n\n"
|
|||
|
|
f"CSV: {csv_path}\n"
|
|||
|
|
f"XLSX: {xlsx_path}"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
messagebox.showerror("Erro", f"Falhou.\n\nDetalhes: {e}")
|
|||
|
|
finally:
|
|||
|
|
self.configure(cursor="")
|
|||
|
|
self.update_idletasks()
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
|
|||
|
|
App().mainloop()
|