Files
Botdaq/DAQ/Untitled-1.py
T
2026-03-14 22:57:45 +00:00

243 lines
7.0 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import re
import sys
from datetime import datetime
import tkinter as tk
from tkinter import filedialog, simpledialog, messagebox
import pandas as pd
import pdfplumber
START_MARKER = "Planeamento e execução da formação "
END_MARKER = "Ação dos formadores"
def norm_text(s: str) -> str:
if not s:
return ""
s = s.replace("–", "-")
s = re.sub(r"\s+", " ", s).strip().lower()
return s
def clean_cell(x) -> str:
if x is None:
return ""
x = str(x).replace("\n", " ").replace("\r", " ").strip()
x = re.sub(r"\s+", " ", x)
return x
def convert(x):
if isinstance(x, (int, float)):
return str(x).replace(".", ",")
if isinstance(x, str):
import re
if re.fullmatch(r"-?\d+\.\d+", x):
return x.replace(".", ",")
return x
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
df = df.apply(lambda col: col.map(clean_cell))
df = df.apply(lambda col: col.map(convert))
df = df.loc[
~(df.apply(lambda r: all(v == "" for v in r), axis=1))
].reset_index(drop=True)
return df
def df_flattext(df: pd.DataFrame) -> str:
vals = df.astype(str).fillna("").values.flatten().tolist()
return " ".join(vals)
def safe_filename(s: str) -> str:
s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
s = re.sub(r"\s+", "_", s.strip())
return s[:180]
def dot_to_comma(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
def convert(x):
if isinstance(x, (int, float)):
return str(x).replace(".", ",")
if isinstance(x, str):
if re.fullmatch(r"-?\d+\.\d+", x):
return x.replace(".", ",")
return x
return df.apply(lambda col: col.map(convert))
def save_row_if_new(
final_df: pd.DataFrame,
xlsx_path: str,
row_name: str,
sheet_name: str = "Resultados",
):
root = tk.Tk()
root.withdraw()
if not os.path.exists(xlsx_path):
final_df.to_excel(xlsx_path, sheet_name=sheet_name, index=True)
messagebox.showinfo("OK", f"Ficheiro criado e linha guardada:\n{row_name}\n\n{xlsx_path}")
return False
try:
existing = pd.read_excel(xlsx_path, sheet_name=sheet_name, index_col=0)
except ValueError:
final_df.to_excel(xlsx_path, sheet_name=sheet_name, index=True)
messagebox.showinfo("OK", f"Sheet '{sheet_name}' criada e linha guardada:\n{row_name}")
return False
if row_name in existing.index.astype(str):
messagebox.showwarning(
"Já existe",
f"Já existe uma linha com este nome:\n{row_name}\n\nNada foi gravado.",
)
return True
updated = pd.concat([existing, final_df], axis=0)
with pd.ExcelWriter(xlsx_path, engine="openpyxl", mode="w") as writer:
updated.to_excel(writer, sheet_name=sheet_name, index=True)
messagebox.showinfo("OK", f"Linha adicionada com sucesso:\n{row_name}\n\n{xlsx_path}")
return False
def main() -> int:
root = tk.Tk()
root.withdraw()
root.update()
pdf_path = filedialog.askopenfilename(
title="Seleciona o PDF",
filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")],
)
if not pdf_path:
messagebox.showerror("Erro", "Nenhum PDF selecionado.")
return 1
pdf_path = os.path.abspath(pdf_path)
course_name = simpledialog.askstring(
title="Nome do Curso",
prompt="Introduz o nome do curso:",
)
if not course_name:
messagebox.showerror("Erro", "Nome do curso nÃo foi preenchido.")
return 1
year = simpledialog.askstring(
title="Ano",
prompt="Introduz o ano (ex: 2024):",
)
if not year or not year.isdigit():
messagebox.showerror("Erro", "Ano inválido.")
return 1
start_n = norm_text(START_MARKER)
end_n = norm_text(END_MARKER)
with pdfplumber.open(pdf_path) as pdf:
start_page = None
end_page = None
for i, page in enumerate(pdf.pages):
txt = norm_text(page.extract_text() or "")
if start_n in txt:
start_page = i
break
if start_page is None:
raise ValueError(f"Não encontrei o ini­cio: {START_MARKER}")
for j in range(start_page, len(pdf.pages)):
txt = norm_text(pdf.pages[j].extract_text() or "")
if end_n in txt:
end_page = j
break
if end_page is None:
raise ValueError(f"Encontrei o inicio, mas não encontrei o fim: {END_MARKER}")
candidates = []
with pdfplumber.open(pdf_path) as pdf:
for pno in range(start_page, end_page + 1):
page = pdf.pages[pno]
tables = page.extract_tables() or []
for ti, tbl in enumerate(tables, start=1):
if not tbl or len(tbl) < 2:
continue
header = tbl[0]
body = tbl[1:]
df = pd.DataFrame(body, columns=header)
df = normalize_df(df)
r, c = df.shape
if r < 2 or c < 2:
continue
text_n = norm_text(df_flattext(df))
score = 0
if c >= 3:
score += 10
if r >= 4:
score += 10
if pno == start_page:
score += 15
for kw in ["planeamento", "execucao", "formacao", "respostas", "formadores"]:
if kw in text_n:
score += 2
empties = (df == "").sum().sum()
total = r * c
if total and empties / total > 0.6:
score -= 10
candidates.append((score, pno, ti, df))
if not candidates:
raise ValueError("Encontrei o bloco, mas não encontrei tabelas nas páginas desse intervalo.")
candidates.sort(key=lambda x: x[0], reverse=True)
best_score, best_pno, best_ti, best_df = candidates[0]
print(f"Bloco: páginas {start_page+1} até {end_page+1}")
print(f"Tabela escolhida: p{best_pno+1}_t{best_ti} (score={best_score})")
print("Dimensões:", best_df.shape)
rows_config = {
1: [0, 2],
2: [0, 2],
3: [0, 2],
5: [0, 2],
7: [0, 2],
8: [0, 2],
9: [0, 2],
10: [0, 2],
12: [0, 3],
13: [0, 3],
14: [0, 3],
15: [0, 3],
16: [0, 3],
17: [0, 3],
}
new_rows = []
for row_idx, col_idxs in rows_config.items():
values = best_df.iloc[row_idx, col_idxs].tolist()
new_rows.append(values)
new_df = pd.DataFrame(new_rows)
data_dict = dict(zip(new_df[0], new_df[1]))
final_df = pd.DataFrame([data_dict])
final_df.index = [f"{year} - {course_name}"]
final_df = dot_to_comma(final_df)
row_name = f"{year} - {course_name}"
xlsx_path = "BDpowerbi.xlsx"
save_row_if_new(final_df, xlsx_path, row_name, sheet_name="Resultados")
return 0
if __name__ == "__main__":
sys.exit(main())