import os import re import sys from datetime import datetime import tkinter as tk from tkinter import filedialog, simpledialog, messagebox import pandas as pd import pdfplumber START_MARKER = "Planeamento e execução da formação – " END_MARKER = "Ação dos formadores" def norm_text(s: str) -> str: if not s: return "" s = s.replace("–", "-") s = re.sub(r"\s+", " ", s).strip().lower() return s def clean_cell(x) -> str: if x is None: return "" x = str(x).replace("\n", " ").replace("\r", " ").strip() x = re.sub(r"\s+", " ", x) return x def convert(x): if isinstance(x, (int, float)): return str(x).replace(".", ",") if isinstance(x, str): import re if re.fullmatch(r"-?\d+\.\d+", x): return x.replace(".", ",") return x def normalize_df(df: pd.DataFrame) -> pd.DataFrame: df = df.copy() df = df.apply(lambda col: col.map(clean_cell)) df = df.apply(lambda col: col.map(convert)) df = df.loc[ ~(df.apply(lambda r: all(v == "" for v in r), axis=1)) ].reset_index(drop=True) return df def df_flattext(df: pd.DataFrame) -> str: vals = df.astype(str).fillna("").values.flatten().tolist() return " ".join(vals) def safe_filename(s: str) -> str: s = re.sub(r"[\\/:*?\"<>|]+", "_", s) s = re.sub(r"\s+", "_", s.strip()) return s[:180] def dot_to_comma(df: pd.DataFrame) -> pd.DataFrame: df = df.copy() def convert(x): if isinstance(x, (int, float)): return str(x).replace(".", ",") if isinstance(x, str): if re.fullmatch(r"-?\d+\.\d+", x): return x.replace(".", ",") return x return df.apply(lambda col: col.map(convert)) def save_row_if_new( final_df: pd.DataFrame, xlsx_path: str, row_name: str, sheet_name: str = "Resultados", ): root = tk.Tk() root.withdraw() if not os.path.exists(xlsx_path): final_df.to_excel(xlsx_path, sheet_name=sheet_name, index=True) messagebox.showinfo("OK", f"Ficheiro criado e linha guardada:\n{row_name}\n\n{xlsx_path}") return False try: existing = pd.read_excel(xlsx_path, sheet_name=sheet_name, index_col=0) except ValueError: final_df.to_excel(xlsx_path, sheet_name=sheet_name, index=True) messagebox.showinfo("OK", f"Sheet '{sheet_name}' criada e linha guardada:\n{row_name}") return False if row_name in existing.index.astype(str): messagebox.showwarning( "Já existe", f"Já existe uma linha com este nome:\n{row_name}\n\nNada foi gravado.", ) return True updated = pd.concat([existing, final_df], axis=0) with pd.ExcelWriter(xlsx_path, engine="openpyxl", mode="w") as writer: updated.to_excel(writer, sheet_name=sheet_name, index=True) messagebox.showinfo("OK", f"Linha adicionada com sucesso:\n{row_name}\n\n{xlsx_path}") return False def main() -> int: root = tk.Tk() root.withdraw() root.update() pdf_path = filedialog.askopenfilename( title="Seleciona o PDF", filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")], ) if not pdf_path: messagebox.showerror("Erro", "Nenhum PDF selecionado.") return 1 pdf_path = os.path.abspath(pdf_path) course_name = simpledialog.askstring( title="Nome do Curso", prompt="Introduz o nome do curso:", ) if not course_name: messagebox.showerror("Erro", "Nome do curso nÃo foi preenchido.") return 1 year = simpledialog.askstring( title="Ano", prompt="Introduz o ano (ex: 2024):", ) if not year or not year.isdigit(): messagebox.showerror("Erro", "Ano inválido.") return 1 start_n = norm_text(START_MARKER) end_n = norm_text(END_MARKER) with pdfplumber.open(pdf_path) as pdf: start_page = None end_page = None for i, page in enumerate(pdf.pages): txt = norm_text(page.extract_text() or "") if start_n in txt: start_page = i break if start_page is None: raise ValueError(f"Não encontrei o ini­cio: {START_MARKER}") for j in range(start_page, len(pdf.pages)): txt = norm_text(pdf.pages[j].extract_text() or "") if end_n in txt: end_page = j break if end_page is None: raise ValueError(f"Encontrei o inicio, mas não encontrei o fim: {END_MARKER}") candidates = [] with pdfplumber.open(pdf_path) as pdf: for pno in range(start_page, end_page + 1): page = pdf.pages[pno] tables = page.extract_tables() or [] for ti, tbl in enumerate(tables, start=1): if not tbl or len(tbl) < 2: continue header = tbl[0] body = tbl[1:] df = pd.DataFrame(body, columns=header) df = normalize_df(df) r, c = df.shape if r < 2 or c < 2: continue text_n = norm_text(df_flattext(df)) score = 0 if c >= 3: score += 10 if r >= 4: score += 10 if pno == start_page: score += 15 for kw in ["planeamento", "execucao", "formacao", "respostas", "formadores"]: if kw in text_n: score += 2 empties = (df == "").sum().sum() total = r * c if total and empties / total > 0.6: score -= 10 candidates.append((score, pno, ti, df)) if not candidates: raise ValueError("Encontrei o bloco, mas não encontrei tabelas nas páginas desse intervalo.") candidates.sort(key=lambda x: x[0], reverse=True) best_score, best_pno, best_ti, best_df = candidates[0] print(f"Bloco: páginas {start_page+1} até {end_page+1}") print(f"Tabela escolhida: p{best_pno+1}_t{best_ti} (score={best_score})") print("Dimensões:", best_df.shape) rows_config = { 1: [0, 2], 2: [0, 2], 3: [0, 2], 5: [0, 2], 7: [0, 2], 8: [0, 2], 9: [0, 2], 10: [0, 2], 12: [0, 3], 13: [0, 3], 14: [0, 3], 15: [0, 3], 16: [0, 3], 17: [0, 3], } new_rows = [] for row_idx, col_idxs in rows_config.items(): values = best_df.iloc[row_idx, col_idxs].tolist() new_rows.append(values) new_df = pd.DataFrame(new_rows) data_dict = dict(zip(new_df[0], new_df[1])) final_df = pd.DataFrame([data_dict]) final_df.index = [f"{year} - {course_name}"] final_df = dot_to_comma(final_df) row_name = f"{year} - {course_name}" xlsx_path = "BDpowerbi.xlsx" save_row_if_new(final_df, xlsx_path, row_name, sheet_name="Resultados") return 0 if __name__ == "__main__": sys.exit(main())