Files
Metacase/ABANCA.ipynb
T
2026-04-19 22:34:20 +01:00

638 lines
20 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 44,
"id": "c12daadb",
"metadata": {},
"outputs": [],
"source": [
"import fitz\n",
"import pytesseract\n",
"from PIL import Image\n",
"import io\n",
"import re\n",
"import pandas as pd\n",
"#pip install camelot-py[cv]\n",
"import camelot\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "a3970a11",
"metadata": {},
"outputs": [],
"source": [
"# Parse every page of the statement PDF with camelot's \"stream\" flavor.\n",
"tables = camelot.read_pdf(\n",
"    \"C:/Users/garci/Downloads/Extracto Sogenave_Março.pdf\",\n",
"    pages=\"all\",\n",
"    flavor=\"stream\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "10067e62",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Data Descrição Data Valor Montante \\\n",
"0 01-03-2026 SALDO ANTERIOR 01-03-2026 \n",
"1 01-03-2026 55389 FECHO TPA:0001455389 061-00006 01-03-2026 1,36 H \n",
"2 01-03-2026 55294 FECHO TPA:0001455294 034-00022 01-03-2026 14,50 H \n",
"3 01-03-2026 55340 FECHO TPA:0001455340 085-00005 01-03-2026 3,67 H \n",
"4 01-03-2026 55370 FECHO TPA:0001455370 053-00003 01-03-2026 1,04 H \n",
"... ... ... ... ... \n",
"4024 31-03-2026 55356 FECHO TPA:0001455356 074-00010 31-03-2026 2,59 H \n",
"4025 31-03-2026 55231 FECHO TPA:0001455231 059-00003 31-03-2026 1,34 H \n",
"4026 31-03-2026 55224 FECHO TPA:0001455224 066-00007 31-03-2026 8,65 H \n",
"4027 31-03-2026 55234 FECHO TPA:0001455234 092-00023 31-03-2026 7,74 H \n",
"4028 31-03-2026 55312 FECHO TPA:0001455312 081-00086 31-03-2026 78,53 H \n",
"\n",
" Saldo \n",
"0 23.240,03 \n",
"1 23.241,39 \n",
"2 23.255,89 \n",
"3 23.259,56 \n",
"4 23.260,60 \n",
"... ... \n",
"4024 67.726,71 \n",
"4025 67.728,05 \n",
"4026 67.736,70 \n",
"4027 67.744,44 \n",
"4028 67.822,97 \n",
"\n",
"[4029 rows x 5 columns]\n"
]
}
],
"source": [
"def split_cell(cell):\n",
"    \"\"\"Split a cell's text on newlines; an empty cell yields one empty string.\"\"\"\n",
"    return cell.split(\"\\n\") if cell else [\"\"]\n",
"\n",
"\n",
"def pad(lst, size):\n",
"    \"\"\"Right-pad lst with empty strings until it holds size items.\"\"\"\n",
"    return lst + [\"\"] * (size - len(lst))\n",
"\n",
"\n",
"# Accumulates one 5-field record per statement line, across all tables.\n",
"linhas_finais = []\n",
"\n",
"for table in tables:\n",
"    df_raw = table.df\n",
"    # Row 0 of each table is skipped (presumably a header line; confirm).\n",
"    for pos in range(1, len(df_raw)):\n",
"        row = df_raw.iloc[pos]\n",
"        if len(row) < 5:\n",
"            # Fewer than five columns: cannot be mapped onto the five fields.\n",
"            continue\n",
"\n",
"        # A single cell may hold several newline-separated values, so each of\n",
"        # the first five cells is split and padded to a common length.\n",
"        pieces = [split_cell(row[k]) for k in range(5)]\n",
"        depth = max(len(piece) for piece in pieces)\n",
"        aligned = [pad(piece, depth) for piece in pieces]\n",
"\n",
"        # Transpose the padded columns into one record per stacked line.\n",
"        linhas_finais.extend(list(record) for record in zip(*aligned))\n",
"\n",
"df_final = pd.DataFrame(\n",
"    linhas_finais,\n",
"    columns=[\"Data\", \"Descrição\", \"Data Valor\", \"Montante\", \"Saldo\"],\n",
")\n",
"\n",
"print(df_final)"
]
},
{
"cell_type": "code",
"execution_count": 102,
"id": "ab442a70",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Descrição</th>\n",
" <th>Data Valor</th>\n",
" <th>Montante</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>55389 FECHO TPA:0001455389 061-00006</td>\n",
" <td>01-03-2026</td>\n",
" <td>1,36 H</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>55294 FECHO TPA:0001455294 034-00022</td>\n",
" <td>01-03-2026</td>\n",
" <td>14,50 H</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>55340 FECHO TPA:0001455340 085-00005</td>\n",
" <td>01-03-2026</td>\n",
" <td>3,67 H</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>55370 FECHO TPA:0001455370 053-00003</td>\n",
" <td>01-03-2026</td>\n",
" <td>1,04 H</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>55205 FECHO TPA:0001455205 028-00009</td>\n",
" <td>01-03-2026</td>\n",
" <td>5,34 H</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4024</th>\n",
" <td>55356 FECHO TPA:0001455356 074-00010</td>\n",
" <td>31-03-2026</td>\n",
" <td>2,59 H</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4025</th>\n",
" <td>55231 FECHO TPA:0001455231 059-00003</td>\n",
" <td>31-03-2026</td>\n",
" <td>1,34 H</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4026</th>\n",
" <td>55224 FECHO TPA:0001455224 066-00007</td>\n",
" <td>31-03-2026</td>\n",
" <td>8,65 H</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4027</th>\n",
" <td>55234 FECHO TPA:0001455234 092-00023</td>\n",
" <td>31-03-2026</td>\n",
" <td>7,74 H</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4028</th>\n",
" <td>55312 FECHO TPA:0001455312 081-00086</td>\n",
" <td>31-03-2026</td>\n",
" <td>78,53 H</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4028 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Descrição Data Valor Montante\n",
"1 55389 FECHO TPA:0001455389 061-00006 01-03-2026 1,36 H\n",
"2 55294 FECHO TPA:0001455294 034-00022 01-03-2026 14,50 H\n",
"3 55340 FECHO TPA:0001455340 085-00005 01-03-2026 3,67 H\n",
"4 55370 FECHO TPA:0001455370 053-00003 01-03-2026 1,04 H\n",
"5 55205 FECHO TPA:0001455205 028-00009 01-03-2026 5,34 H\n",
"... ... ... ...\n",
"4024 55356 FECHO TPA:0001455356 074-00010 31-03-2026 2,59 H\n",
"4025 55231 FECHO TPA:0001455231 059-00003 31-03-2026 1,34 H\n",
"4026 55224 FECHO TPA:0001455224 066-00007 31-03-2026 8,65 H\n",
"4027 55234 FECHO TPA:0001455234 092-00023 31-03-2026 7,74 H\n",
"4028 55312 FECHO TPA:0001455312 081-00086 31-03-2026 78,53 H\n",
"\n",
"[4028 rows x 3 columns]"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# errors=\"ignore\" keeps this cell re-runnable: once the two columns and row 0\n",
"# are gone, executing it again is a no-op instead of raising KeyError.\n",
"df_final = df_final.drop(columns=[\"Data\", \"Saldo\"], errors=\"ignore\")\n",
"# Index 0 is the first extracted row (the \"SALDO ANTERIOR\" line in the saved output).\n",
"df_final = df_final.drop(index=0, errors=\"ignore\")\n",
"df_final"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "9498de76",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Descrição</th>\n",
" <th>Data Valor</th>\n",
" <th>Montante</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>536</th>\n",
" <td>Cob-Venc. LEAS:530-3001-000939-050-01/2</td>\n",
" <td>05-03-2026</td>\n",
" <td>-4898,03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>537</th>\n",
" <td>Cob-Venc. LEAS:530-3001-000941-050-01/1</td>\n",
" <td>05-03-2026</td>\n",
" <td>-6245,72</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1014</th>\n",
" <td>COMISSÃO CERTIFICADO DE SALDOS E INF. A</td>\n",
" <td>09-03-2026</td>\n",
" <td>-100,00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1015</th>\n",
" <td>IVA COMISSÃO IVA(CERTIFICADO DE SALDOS E</td>\n",
" <td>09-03-2026</td>\n",
" <td>-23,00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1925</th>\n",
" <td>Cob-Venc. LEAS:530-3001-000940-050-01/3</td>\n",
" <td>16-03-2026</td>\n",
" <td>-2216,87</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Descrição Data Valor Montante\n",
"536 Cob-Venc. LEAS:530-3001-000939-050-01/2 05-03-2026 -4898,03 \n",
"537 Cob-Venc. LEAS:530-3001-000941-050-01/1 05-03-2026 -6245,72 \n",
"1014 COMISSÃO CERTIFICADO DE SALDOS E INF. A 09-03-2026 -100,00 \n",
"1015 IVA COMISSÃO IVA(CERTIFICADO DE SALDOS E 09-03-2026 -23,00 \n",
"1925 Cob-Venc. LEAS:530-3001-000940-050-01/3 16-03-2026 -2216,87 "
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Select the rows whose amount text contains the \"D\" flag; these amounts are\n",
"# turned negative below. .copy() detaches the selection from df_final so the\n",
"# column assignments operate on an independent frame (no SettingWithCopyWarning).\n",
"has_d_flag = df_final[\"Montante\"].str.contains(\"D\", na=False)\n",
"df_D = df_final[has_d_flag].copy()\n",
"\n",
"# Remove the flag letter and the \".\" grouping separators, then prepend the\n",
"# minus sign. The decimal \",\" is left untouched here.\n",
"amount_text = (\n",
"    df_D[\"Montante\"]\n",
"    .str.replace(\"D\", \"\", regex=False)\n",
"    .str.replace(\".\", \"\", regex=False)\n",
")\n",
"df_D[\"Montante\"] = \"-\" + amount_text.astype(str)\n",
"\n",
"# Replace the stray \"ˆ\" character found in extracted descriptions with \"Ã\".\n",
"df_D[\"Descrição\"] = df_D[\"Descrição\"].str.replace(\"ˆ\", \"Ã\", regex=False)\n",
"df_D"
]
},
{
"cell_type": "code",
"execution_count": 108,
"id": "50b681a6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Descrição</th>\n",
" <th>Data Valor</th>\n",
" <th>Montante</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TPA:0001455389</td>\n",
" <td>01-03-2026</td>\n",
" <td>1,36</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>TPA:0001455294</td>\n",
" <td>01-03-2026</td>\n",
" <td>14,50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TPA:0001455340</td>\n",
" <td>01-03-2026</td>\n",
" <td>3,67</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>TPA:0001455370</td>\n",
" <td>01-03-2026</td>\n",
" <td>1,04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>TPA:0001455205</td>\n",
" <td>01-03-2026</td>\n",
" <td>5,34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4024</th>\n",
" <td>TPA:0001455356</td>\n",
" <td>31-03-2026</td>\n",
" <td>2,59</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4025</th>\n",
" <td>TPA:0001455231</td>\n",
" <td>31-03-2026</td>\n",
" <td>1,34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4026</th>\n",
" <td>TPA:0001455224</td>\n",
" <td>31-03-2026</td>\n",
" <td>8,65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4027</th>\n",
" <td>TPA:0001455234</td>\n",
" <td>31-03-2026</td>\n",
" <td>7,74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4028</th>\n",
" <td>TPA:0001455312</td>\n",
" <td>31-03-2026</td>\n",
" <td>78,53</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3995 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Descrição Data Valor Montante\n",
"1 TPA:0001455389 01-03-2026 1,36 \n",
"2 TPA:0001455294 01-03-2026 14,50 \n",
"3 TPA:0001455340 01-03-2026 3,67 \n",
"4 TPA:0001455370 01-03-2026 1,04 \n",
"5 TPA:0001455205 01-03-2026 5,34 \n",
"... ... ... ...\n",
"4024 TPA:0001455356 31-03-2026 2,59 \n",
"4025 TPA:0001455231 31-03-2026 1,34 \n",
"4026 TPA:0001455224 31-03-2026 8,65 \n",
"4027 TPA:0001455234 31-03-2026 7,74 \n",
"4028 TPA:0001455312 31-03-2026 78,53 \n",
"\n",
"[3995 rows x 3 columns]"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Select the rows whose amount text contains the \"H\" flag. .copy() detaches the\n",
"# selection from df_final so the column assignments operate on an independent\n",
"# frame (no SettingWithCopyWarning).\n",
"has_h_flag = df_final[\"Montante\"].str.contains(\"H\", na=False)\n",
"df_H = df_final[has_h_flag].copy()\n",
"\n",
"# Keep only the \"TPA:<digits>\" token of each description. expand=False makes\n",
"# str.extract return a Series, which is what a single-column assignment expects.\n",
"# NOTE(review): descriptions without a TPA token become NaN; confirm this is intended.\n",
"df_H[\"Descrição\"] = df_H[\"Descrição\"].str.extract(r'(TPA:\\d+)', expand=False)\n",
"\n",
"# Remove the flag letter and the \".\" grouping separators; the decimal \",\" stays.\n",
"df_H[\"Montante\"] = (\n",
"    df_H[\"Montante\"]\n",
"    .str.replace(\"H\", \"\", regex=False)\n",
"    .str.replace(\".\", \"\", regex=False)\n",
")\n",
"\n",
"df_H"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "c752079f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Descrição</th>\n",
" <th>Data Valor</th>\n",
" <th>Montante</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Cob-Venc. LEAS:530-3001-000939-050-01/2</td>\n",
" <td>05-03-2026</td>\n",
" <td>-4898.03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Cob-Venc. LEAS:530-3001-000941-050-01/1</td>\n",
" <td>05-03-2026</td>\n",
" <td>-6245.72</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>COMISSÃO CERTIFICADO DE SALDOS E INF. A</td>\n",
" <td>09-03-2026</td>\n",
" <td>-100.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>IVA COMISSÃO IVA(CERTIFICADO DE SALDOS E</td>\n",
" <td>09-03-2026</td>\n",
" <td>-23.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Cob-Venc. LEAS:530-3001-000940-050-01/3</td>\n",
" <td>16-03-2026</td>\n",
" <td>-2216.87</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3995</th>\n",
" <td>TPA:0001455356</td>\n",
" <td>31-03-2026</td>\n",
" <td>2.59</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3996</th>\n",
" <td>TPA:0001455231</td>\n",
" <td>31-03-2026</td>\n",
" <td>1.34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3997</th>\n",
" <td>TPA:0001455224</td>\n",
" <td>31-03-2026</td>\n",
" <td>8.65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3998</th>\n",
" <td>TPA:0001455234</td>\n",
" <td>31-03-2026</td>\n",
" <td>7.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3999</th>\n",
" <td>TPA:0001455312</td>\n",
" <td>31-03-2026</td>\n",
" <td>78.53</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4000 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Descrição Data Valor Montante\n",
"0 Cob-Venc. LEAS:530-3001-000939-050-01/2 05-03-2026 -4898.03\n",
"1 Cob-Venc. LEAS:530-3001-000941-050-01/1 05-03-2026 -6245.72\n",
"2 COMISSÃO CERTIFICADO DE SALDOS E INF. A 09-03-2026 -100.00\n",
"3 IVA COMISSÃO IVA(CERTIFICADO DE SALDOS E 09-03-2026 -23.00\n",
"4 Cob-Venc. LEAS:530-3001-000940-050-01/3 16-03-2026 -2216.87\n",
"... ... ... ...\n",
"3995 TPA:0001455356 31-03-2026 2.59\n",
"3996 TPA:0001455231 31-03-2026 1.34\n",
"3997 TPA:0001455224 31-03-2026 8.65\n",
"3998 TPA:0001455234 31-03-2026 7.74\n",
"3999 TPA:0001455312 31-03-2026 78.53\n",
"\n",
"[4000 rows x 3 columns]"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Stack the \"D\" rows on top of the \"H\" rows with a fresh 0..n-1 index, then\n",
"# turn the amount text into floats by swapping the decimal \",\" for \".\".\n",
"dffinal = (\n",
"    pd.concat([df_D, df_H], ignore_index=True)\n",
"    .assign(\n",
"        Montante=lambda frame: frame[\"Montante\"]\n",
"        .str.replace(\",\", \".\", regex=False)\n",
"        .astype(float)\n",
"    )\n",
")\n",
"dffinal"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "76673631",
"metadata": {},
"outputs": [],
"source": [
"# Export the combined frame to an Excel workbook using to_excel's defaults.\n",
"dffinal.to_excel(\n",
"    \"C:/Users/garci/Downloads/MAR 25 BIC2.xlsx\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}