243 lines
7.0 KiB
Python
243 lines
7.0 KiB
Python
import os
|
||
import re
|
||
import sys
|
||
from datetime import datetime
|
||
import tkinter as tk
|
||
from tkinter import filedialog, simpledialog, messagebox
|
||
|
||
import pandas as pd
|
||
import pdfplumber
|
||
|
||
|
||
START_MARKER = "Planeamento e execução da formação – "
|
||
END_MARKER = "Ação dos formadores"
|
||
|
||
|
||
def norm_text(s: str) -> str:
|
||
if not s:
|
||
return ""
|
||
s = s.replace("–", "-")
|
||
s = re.sub(r"\s+", " ", s).strip().lower()
|
||
return s
|
||
|
||
|
||
def clean_cell(x) -> str:
|
||
if x is None:
|
||
return ""
|
||
x = str(x).replace("\n", " ").replace("\r", " ").strip()
|
||
x = re.sub(r"\s+", " ", x)
|
||
return x
|
||
|
||
def convert(x):
|
||
if isinstance(x, (int, float)):
|
||
return str(x).replace(".", ",")
|
||
if isinstance(x, str):
|
||
import re
|
||
if re.fullmatch(r"-?\d+\.\d+", x):
|
||
return x.replace(".", ",")
|
||
return x
|
||
|
||
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
|
||
df = df.copy()
|
||
df = df.apply(lambda col: col.map(clean_cell))
|
||
df = df.apply(lambda col: col.map(convert))
|
||
df = df.loc[
|
||
~(df.apply(lambda r: all(v == "" for v in r), axis=1))
|
||
].reset_index(drop=True)
|
||
|
||
return df
|
||
|
||
|
||
|
||
def df_flattext(df: pd.DataFrame) -> str:
|
||
vals = df.astype(str).fillna("").values.flatten().tolist()
|
||
return " ".join(vals)
|
||
|
||
|
||
def safe_filename(s: str) -> str:
|
||
s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
|
||
s = re.sub(r"\s+", "_", s.strip())
|
||
return s[:180]
|
||
|
||
|
||
def dot_to_comma(df: pd.DataFrame) -> pd.DataFrame:
|
||
df = df.copy()
|
||
|
||
def convert(x):
|
||
if isinstance(x, (int, float)):
|
||
return str(x).replace(".", ",")
|
||
if isinstance(x, str):
|
||
if re.fullmatch(r"-?\d+\.\d+", x):
|
||
return x.replace(".", ",")
|
||
return x
|
||
|
||
return df.apply(lambda col: col.map(convert))
|
||
|
||
|
||
def save_row_if_new(
|
||
final_df: pd.DataFrame,
|
||
xlsx_path: str,
|
||
row_name: str,
|
||
sheet_name: str = "Resultados",
|
||
):
|
||
root = tk.Tk()
|
||
root.withdraw()
|
||
if not os.path.exists(xlsx_path):
|
||
final_df.to_excel(xlsx_path, sheet_name=sheet_name, index=True)
|
||
messagebox.showinfo("OK", f"Ficheiro criado e linha guardada:\n{row_name}\n\n{xlsx_path}")
|
||
return False
|
||
try:
|
||
existing = pd.read_excel(xlsx_path, sheet_name=sheet_name, index_col=0)
|
||
except ValueError:
|
||
final_df.to_excel(xlsx_path, sheet_name=sheet_name, index=True)
|
||
messagebox.showinfo("OK", f"Sheet '{sheet_name}' criada e linha guardada:\n{row_name}")
|
||
return False
|
||
if row_name in existing.index.astype(str):
|
||
messagebox.showwarning(
|
||
"Já existe",
|
||
f"Já existe uma linha com este nome:\n{row_name}\n\nNada foi gravado.",
|
||
)
|
||
return True
|
||
updated = pd.concat([existing, final_df], axis=0)
|
||
with pd.ExcelWriter(xlsx_path, engine="openpyxl", mode="w") as writer:
|
||
updated.to_excel(writer, sheet_name=sheet_name, index=True)
|
||
messagebox.showinfo("OK", f"Linha adicionada com sucesso:\n{row_name}\n\n{xlsx_path}")
|
||
return False
|
||
|
||
|
||
def main() -> int:
|
||
|
||
root = tk.Tk()
|
||
root.withdraw()
|
||
root.update()
|
||
|
||
pdf_path = filedialog.askopenfilename(
|
||
title="Seleciona o PDF",
|
||
filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")],
|
||
)
|
||
if not pdf_path:
|
||
messagebox.showerror("Erro", "Nenhum PDF selecionado.")
|
||
return 1
|
||
|
||
pdf_path = os.path.abspath(pdf_path)
|
||
|
||
course_name = simpledialog.askstring(
|
||
title="Nome do Curso",
|
||
prompt="Introduz o nome do curso:",
|
||
)
|
||
if not course_name:
|
||
messagebox.showerror("Erro", "Nome do curso nÃo foi preenchido.")
|
||
return 1
|
||
|
||
year = simpledialog.askstring(
|
||
title="Ano",
|
||
prompt="Introduz o ano (ex: 2024):",
|
||
)
|
||
if not year or not year.isdigit():
|
||
messagebox.showerror("Erro", "Ano inválido.")
|
||
return 1
|
||
|
||
start_n = norm_text(START_MARKER)
|
||
end_n = norm_text(END_MARKER)
|
||
with pdfplumber.open(pdf_path) as pdf:
|
||
start_page = None
|
||
end_page = None
|
||
for i, page in enumerate(pdf.pages):
|
||
txt = norm_text(page.extract_text() or "")
|
||
if start_n in txt:
|
||
start_page = i
|
||
break
|
||
if start_page is None:
|
||
raise ValueError(f"Não encontrei o inicio: {START_MARKER}")
|
||
for j in range(start_page, len(pdf.pages)):
|
||
txt = norm_text(pdf.pages[j].extract_text() or "")
|
||
if end_n in txt:
|
||
end_page = j
|
||
break
|
||
if end_page is None:
|
||
raise ValueError(f"Encontrei o inicio, mas não encontrei o fim: {END_MARKER}")
|
||
|
||
candidates = []
|
||
with pdfplumber.open(pdf_path) as pdf:
|
||
for pno in range(start_page, end_page + 1):
|
||
page = pdf.pages[pno]
|
||
tables = page.extract_tables() or []
|
||
|
||
for ti, tbl in enumerate(tables, start=1):
|
||
if not tbl or len(tbl) < 2:
|
||
continue
|
||
header = tbl[0]
|
||
body = tbl[1:]
|
||
df = pd.DataFrame(body, columns=header)
|
||
df = normalize_df(df)
|
||
|
||
r, c = df.shape
|
||
if r < 2 or c < 2:
|
||
continue
|
||
text_n = norm_text(df_flattext(df))
|
||
score = 0
|
||
if c >= 3:
|
||
score += 10
|
||
if r >= 4:
|
||
score += 10
|
||
if pno == start_page:
|
||
score += 15
|
||
for kw in ["planeamento", "execucao", "formacao", "respostas", "formadores"]:
|
||
if kw in text_n:
|
||
score += 2
|
||
|
||
empties = (df == "").sum().sum()
|
||
total = r * c
|
||
if total and empties / total > 0.6:
|
||
score -= 10
|
||
|
||
candidates.append((score, pno, ti, df))
|
||
|
||
if not candidates:
|
||
raise ValueError("Encontrei o bloco, mas não encontrei tabelas nas páginas desse intervalo.")
|
||
|
||
candidates.sort(key=lambda x: x[0], reverse=True)
|
||
best_score, best_pno, best_ti, best_df = candidates[0]
|
||
|
||
print(f"Bloco: páginas {start_page+1} até {end_page+1}")
|
||
print(f"Tabela escolhida: p{best_pno+1}_t{best_ti} (score={best_score})")
|
||
print("Dimensões:", best_df.shape)
|
||
|
||
rows_config = {
|
||
1: [0, 2],
|
||
2: [0, 2],
|
||
3: [0, 2],
|
||
5: [0, 2],
|
||
7: [0, 2],
|
||
8: [0, 2],
|
||
9: [0, 2],
|
||
10: [0, 2],
|
||
12: [0, 3],
|
||
13: [0, 3],
|
||
14: [0, 3],
|
||
15: [0, 3],
|
||
16: [0, 3],
|
||
17: [0, 3],
|
||
}
|
||
|
||
new_rows = []
|
||
for row_idx, col_idxs in rows_config.items():
|
||
values = best_df.iloc[row_idx, col_idxs].tolist()
|
||
new_rows.append(values)
|
||
|
||
new_df = pd.DataFrame(new_rows)
|
||
|
||
data_dict = dict(zip(new_df[0], new_df[1]))
|
||
final_df = pd.DataFrame([data_dict])
|
||
final_df.index = [f"{year} - {course_name}"]
|
||
final_df = dot_to_comma(final_df)
|
||
|
||
row_name = f"{year} - {course_name}"
|
||
xlsx_path = "BDpowerbi.xlsx"
|
||
save_row_if_new(final_df, xlsx_path, row_name, sheet_name="Resultados")
|
||
return 0
|
||
|
||
|
||
if __name__ == "__main__":
|
||
sys.exit(main())
|