Botdaq/DAQ/Construção de Databases.py

import os
import re
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from datetime import datetime

import pandas as pd
import pdfplumber


START_MARKER = "Planeamento e execução da formação – Respostas:"
END_MARKER   = "Ação dos formadores"


def clean_cell(x) -> str:
    if x is None:
        return ""
    x = str(x).replace("\n", " ").replace("\r", " ").strip()
    x = re.sub(r"\s+", " ", x)
    return x


def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df = df.applymap(clean_cell)
    df = df.loc[~(df.apply(lambda r: all(v == "" for v in r), axis=1))].reset_index(drop=True)
    return df


def df_text(df: pd.DataFrame) -> str:
    vals = df.astype(str).fillna("").values.flatten().tolist()
    return " ".join(vals)


def safe_filename(s: str) -> str:
    s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
    s = re.sub(r"\s+", "_", s.strip())
    return s[:180] if len(s) > 180 else s


def save_outputs(df: pd.DataFrame, pdf_path: str, year: str, course: str, out_dir: str):
    os.makedirs(out_dir, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    base = f"{safe_filename(os.path.splitext(os.path.basename(pdf_path))[0])}__{safe_filename(course)}__{safe_filename(year)}__{stamp}"
    csv_path = os.path.join(out_dir, base + ".csv")
    xlsx_path = os.path.join(out_dir, base + ".xlsx")

    df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    df.to_excel(xlsx_path, index=False)
    return csv_path, xlsx_path


def normalize_marker(s: str) -> str:
    s = s.replace("–", "-")
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s


def find_target_table(pdf_path: str):
    start_n = normalize_marker(START_MARKER)
    end_n   = normalize_marker(END_MARKER)

    with pdfplumber.open(pdf_path) as pdf:
        start_page_idx = None
        for i, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            tnorm = normalize_marker(text)
            if start_n in tnorm:
                start_page_idx = i
                break
        if start_page_idx is None:
            raise ValueError(f"Não encontrei o início do bloco: '{START_MARKER}'")
        end_page_idx = None
        for j in range(start_page_idx, len(pdf.pages)):
            text = pdf.pages[j].extract_text() or ""
            tnorm = normalize_marker(text)
            if end_n in tnorm:
                end_page_idx = j
                break

        if end_page_idx is None:
            raise ValueError(f"Encontrei o início, mas não encontrei o fim do bloco: '{END_MARKER}'")
        candidate_tables = []
        for pno in range(start_page_idx, end_page_idx + 1):
            page = pdf.pages[pno]
            tables = page.extract_tables() or []
            page_text = normalize_marker(page.extract_text() or "")

            for ti, tbl in enumerate(tables, start=1):
                if not tbl or len(tbl) < 2:
                    continue
                df = pd.DataFrame(tbl[1:], columns=tbl[0])
                df = normalize_df(df)
                raw = df_text(df)
                raw_n = normalize_marker(raw)

                score = 0
                r, c = df.shape
                if c >= 3: score += 10
                if r >= 4: score += 10
                if pno == start_page_idx:
                    score += 15
                for kw in ["respostas", "planeamento", "execução", "formação", "formadores"]:
                    if kw in raw_n:
                        score += 2
                empties = (df == "").sum().sum()
                total = r * c
                if total and empties / total > 0.6:
                    score -= 10

                candidate_tables.append((score, pno, ti, df))

        if not candidate_tables:
            raise ValueError("Encontrei o bloco, mas não encontrei tabelas nesse intervalo.")
        candidate_tables.sort(key=lambda x: x[0], reverse=True)
        best_score, best_pno, best_ti, best_df = candidate_tables[0]

        return {
            "start_page": start_page_idx + 1,
            "end_page": end_page_idx + 1,
            "best_page": best_pno + 1,
            "best_table_id": f"p{best_pno+1}_t{best_ti}",
            "score": best_score,
            "df": best_df
        }


class App(tk.Tk):
    def __init__(self):
        super().__init__()
        self.title("Extrair Tabela do PDF (bloco entre títulos)")
        self.geometry("820x300")
        self.resizable(False, False)

        self.pdf_path = tk.StringVar(value="")
        self.year_var = tk.StringVar(value="")
        self.course_var = tk.StringVar(value="")
        self.outdir_var = tk.StringVar(value="output_tables")

        self._build_ui()

    def _build_ui(self):
        frm = ttk.Frame(self, padding=14)
        frm.pack(fill="both", expand=True)

        # PDF
        ttk.Label(frm, text="PDF:").grid(row=0, column=0, sticky="w")
        ttk.Entry(frm, textvariable=self.pdf_path, width=74).grid(row=0, column=1, sticky="w", padx=(8, 8))
        ttk.Button(frm, text="Escolher...", command=self.on_browse_pdf).grid(row=0, column=2, sticky="w")

        # Ano
        ttk.Label(frm, text="Ano:").grid(row=1, column=0, sticky="w", pady=(12, 0))
        ttk.Entry(frm, textvariable=self.year_var, width=20).grid(row=1, column=1, sticky="w", padx=(8, 0), pady=(12, 0))

        # Curso
        ttk.Label(frm, text="Curso:").grid(row=2, column=0, sticky="w", pady=(12, 0))
        ttk.Entry(frm, textvariable=self.course_var, width=50).grid(row=2, column=1, sticky="w", padx=(8, 0), pady=(12, 0))

        # Output dir
        ttk.Label(frm, text="Pasta de saída:").grid(row=3, column=0, sticky="w", pady=(12, 0))
        ttk.Entry(frm, textvariable=self.outdir_var, width=40).grid(row=3, column=1, sticky="w", padx=(8, 0), pady=(12, 0))

        # info sobre markers
        info = (
            f"Vai extrair a tabela do bloco:\n"
            f"INÍCIO: {START_MARKER}\n"
            f"FIM: {END_MARKER}"
        )
        ttk.Label(frm, text=info).grid(row=4, column=0, columnspan=3, sticky="w", pady=(14, 0))

        # Botões
        btns = ttk.Frame(frm)
        btns.grid(row=5, column=0, columnspan=3, sticky="e", pady=(18, 0))
        ttk.Button(btns, text="Extrair", command=self.on_extract).pack(side="right", padx=(8, 0))
        ttk.Button(btns, text="Sair", command=self.destroy).pack(side="right")

    def on_browse_pdf(self):
        path = filedialog.askopenfilename(
            title="Seleciona um PDF",
            filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")]
        )
        if path:
            self.pdf_path.set(path)

    def on_extract(self):
        pdf_path = self.pdf_path.get().strip()
        year = self.year_var.get().strip()
        course = self.course_var.get().strip()
        outdir = self.outdir_var.get().strip() or "output_tables"

        if not pdf_path or not os.path.exists(pdf_path):
            messagebox.showerror("Erro", "Seleciona um PDF válido.")
            return
        if not year:
            messagebox.showerror("Erro", "Preenche o Ano.")
            return
        if not course:
            messagebox.showerror("Erro", "Preenche o Curso.")
            return

        try:
            self.configure(cursor="watch")
            self.update_idletasks()

            result = find_target_table(pdf_path)
            df = result["df"]

            csv_path, xlsx_path = save_outputs(df, pdf_path, year, course, outdir)

            messagebox.showinfo(
                "OK",
                "Tabela extraída com sucesso!\n\n"
                f"Bloco encontrado nas páginas: {result['start_page']} → {result['end_page']}\n"
                f"Tabela escolhida: {result['best_table_id']} (score={result['score']})\n"
                f"Dimensões: {df.shape[0]} x {df.shape[1]}\n\n"
                f"CSV: {csv_path}\n"
                f"XLSX: {xlsx_path}"
            )

        except Exception as e:
            messagebox.showerror("Erro", f"Falhou.\n\nDetalhes: {e}")
        finally:
            self.configure(cursor="")
            self.update_idletasks()


if __name__ == "__main__":
    App().mainloop()