Files
Botdaq/DAQ/Construção de Databases.py
2026-03-14 22:57:45 +00:00

230 lines
8.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import re
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from datetime import datetime
import pandas as pd
import pdfplumber
START_MARKER = "Planeamento e execução da formação Respostas:"
END_MARKER = "Ação dos formadores"
def clean_cell(x) -> str:
if x is None:
return ""
x = str(x).replace("\n", " ").replace("\r", " ").strip()
x = re.sub(r"\s+", " ", x)
return x
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
df = df.applymap(clean_cell)
df = df.loc[~(df.apply(lambda r: all(v == "" for v in r), axis=1))].reset_index(drop=True)
return df
def df_text(df: pd.DataFrame) -> str:
vals = df.astype(str).fillna("").values.flatten().tolist()
return " ".join(vals)
def safe_filename(s: str) -> str:
s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
s = re.sub(r"\s+", "_", s.strip())
return s[:180] if len(s) > 180 else s
def save_outputs(df: pd.DataFrame, pdf_path: str, year: str, course: str, out_dir: str):
os.makedirs(out_dir, exist_ok=True)
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base = f"{safe_filename(os.path.splitext(os.path.basename(pdf_path))[0])}__{safe_filename(course)}__{safe_filename(year)}__{stamp}"
csv_path = os.path.join(out_dir, base + ".csv")
xlsx_path = os.path.join(out_dir, base + ".xlsx")
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
df.to_excel(xlsx_path, index=False)
return csv_path, xlsx_path
def normalize_marker(s: str) -> str:
s = s.replace("", "-")
s = re.sub(r"\s+", " ", s).strip().lower()
return s
def find_target_table(pdf_path: str):
start_n = normalize_marker(START_MARKER)
end_n = normalize_marker(END_MARKER)
with pdfplumber.open(pdf_path) as pdf:
start_page_idx = None
for i, page in enumerate(pdf.pages):
text = page.extract_text() or ""
tnorm = normalize_marker(text)
if start_n in tnorm:
start_page_idx = i
break
if start_page_idx is None:
raise ValueError(f"Não encontrei o início do bloco: '{START_MARKER}'")
end_page_idx = None
for j in range(start_page_idx, len(pdf.pages)):
text = pdf.pages[j].extract_text() or ""
tnorm = normalize_marker(text)
if end_n in tnorm:
end_page_idx = j
break
if end_page_idx is None:
raise ValueError(f"Encontrei o início, mas não encontrei o fim do bloco: '{END_MARKER}'")
candidate_tables = []
for pno in range(start_page_idx, end_page_idx + 1):
page = pdf.pages[pno]
tables = page.extract_tables() or []
page_text = normalize_marker(page.extract_text() or "")
for ti, tbl in enumerate(tables, start=1):
if not tbl or len(tbl) < 2:
continue
df = pd.DataFrame(tbl[1:], columns=tbl[0])
df = normalize_df(df)
raw = df_text(df)
raw_n = normalize_marker(raw)
score = 0
r, c = df.shape
if c >= 3: score += 10
if r >= 4: score += 10
if pno == start_page_idx:
score += 15
for kw in ["respostas", "planeamento", "execução", "formação", "formadores"]:
if kw in raw_n:
score += 2
empties = (df == "").sum().sum()
total = r * c
if total and empties / total > 0.6:
score -= 10
candidate_tables.append((score, pno, ti, df))
if not candidate_tables:
raise ValueError("Encontrei o bloco, mas não encontrei tabelas nesse intervalo.")
candidate_tables.sort(key=lambda x: x[0], reverse=True)
best_score, best_pno, best_ti, best_df = candidate_tables[0]
return {
"start_page": start_page_idx + 1,
"end_page": end_page_idx + 1,
"best_page": best_pno + 1,
"best_table_id": f"p{best_pno+1}_t{best_ti}",
"score": best_score,
"df": best_df
}
class App(tk.Tk):
def __init__(self):
super().__init__()
self.title("Extrair Tabela do PDF (bloco entre títulos)")
self.geometry("820x300")
self.resizable(False, False)
self.pdf_path = tk.StringVar(value="")
self.year_var = tk.StringVar(value="")
self.course_var = tk.StringVar(value="")
self.outdir_var = tk.StringVar(value="output_tables")
self._build_ui()
def _build_ui(self):
frm = ttk.Frame(self, padding=14)
frm.pack(fill="both", expand=True)
# PDF
ttk.Label(frm, text="PDF:").grid(row=0, column=0, sticky="w")
ttk.Entry(frm, textvariable=self.pdf_path, width=74).grid(row=0, column=1, sticky="w", padx=(8, 8))
ttk.Button(frm, text="Escolher...", command=self.on_browse_pdf).grid(row=0, column=2, sticky="w")
# Ano
ttk.Label(frm, text="Ano:").grid(row=1, column=0, sticky="w", pady=(12, 0))
ttk.Entry(frm, textvariable=self.year_var, width=20).grid(row=1, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
# Curso
ttk.Label(frm, text="Curso:").grid(row=2, column=0, sticky="w", pady=(12, 0))
ttk.Entry(frm, textvariable=self.course_var, width=50).grid(row=2, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
# Output dir
ttk.Label(frm, text="Pasta de saída:").grid(row=3, column=0, sticky="w", pady=(12, 0))
ttk.Entry(frm, textvariable=self.outdir_var, width=40).grid(row=3, column=1, sticky="w", padx=(8, 0), pady=(12, 0))
# info sobre markers
info = (
f"Vai extrair a tabela do bloco:\n"
f"INÍCIO: {START_MARKER}\n"
f"FIM: {END_MARKER}"
)
ttk.Label(frm, text=info).grid(row=4, column=0, columnspan=3, sticky="w", pady=(14, 0))
# Botões
btns = ttk.Frame(frm)
btns.grid(row=5, column=0, columnspan=3, sticky="e", pady=(18, 0))
ttk.Button(btns, text="Extrair", command=self.on_extract).pack(side="right", padx=(8, 0))
ttk.Button(btns, text="Sair", command=self.destroy).pack(side="right")
def on_browse_pdf(self):
path = filedialog.askopenfilename(
title="Seleciona um PDF",
filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")]
)
if path:
self.pdf_path.set(path)
def on_extract(self):
pdf_path = self.pdf_path.get().strip()
year = self.year_var.get().strip()
course = self.course_var.get().strip()
outdir = self.outdir_var.get().strip() or "output_tables"
if not pdf_path or not os.path.exists(pdf_path):
messagebox.showerror("Erro", "Seleciona um PDF válido.")
return
if not year:
messagebox.showerror("Erro", "Preenche o Ano.")
return
if not course:
messagebox.showerror("Erro", "Preenche o Curso.")
return
try:
self.configure(cursor="watch")
self.update_idletasks()
result = find_target_table(pdf_path)
df = result["df"]
csv_path, xlsx_path = save_outputs(df, pdf_path, year, course, outdir)
messagebox.showinfo(
"OK",
"Tabela extraída com sucesso!\n\n"
f"Bloco encontrado nas páginas: {result['start_page']}{result['end_page']}\n"
f"Tabela escolhida: {result['best_table_id']} (score={result['score']})\n"
f"Dimensões: {df.shape[0]} x {df.shape[1]}\n\n"
f"CSV: {csv_path}\n"
f"XLSX: {xlsx_path}"
)
except Exception as e:
messagebox.showerror("Erro", f"Falhou.\n\nDetalhes: {e}")
finally:
self.configure(cursor="")
self.update_idletasks()
if __name__ == "__main__":
App().mainloop()