Skip to content

Commit d4d6eef

Browse files
authored
Merge pull request #185 from Lemoncode/feature/mino-scraping
Feature/mino scraping
2 parents cd692f5 + d62ad57 commit d4d6eef

19 files changed

Lines changed: 373 additions & 37 deletions

File tree

front/next-env.d.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/// <reference types="next" />
22
/// <reference types="next/image-types/global" />
3-
import "./.next/dev/types/routes.d.ts";
3+
import "./.next/types/routes.d.ts";
44

55
// NOTE: This file should not be edited
66
// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.

functions/src/functions/scraping-functions.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ export async function scrapingsFunction(
3636

3737
const responseCuencaSegura = await embalsesRepository.actualizarCuencaSegura();
3838

39+
const responseCuencaMinoSil = await embalsesRepository.actualizarCuencaMinoSil();
40+
3941
if (responseCuencaMediterranea) {
4042
context.log(
4143
"scrapings-function: Se han actualizado los embalses de la cuenca Mediterránea",
@@ -92,6 +94,14 @@ export async function scrapingsFunction(
9294
);
9395
}
9496

97+
if (responseCuencaMinoSil) {
98+
context.log(`Se han actualizado los embalses de la cuenca Mino Sil`);
99+
} else {
100+
context.log(
101+
"No se han podido actualizar los embalses de la cuenca Mino Sil"
102+
);
103+
}
104+
95105
} catch (error) {
96106
context.error("scrapings-function: ERROR", error);
97107
throw error;

integrations/scraping-cuenca-mino-sil/package.json

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,20 @@
44
"private": true,
55
"type": "module",
66
"exports": {
7-
".": "./src/index.ts"
7+
".": "./dist/index.js"
88
},
9+
"main": "./dist/index.js",
10+
"types": "./dist/index.d.ts",
911
"scripts": {
10-
"start": "tsx --watch src/console-runner.ts"
12+
"start": "tsx --watch src/console-runner.ts",
13+
"build": "run-p clean type-check build:scraping-cuenca-mino-sil",
14+
"build:scraping-cuenca-mino-sil": "tsc",
15+
"clean": "rimraf dist",
16+
"type-check": "tsc --noEmit --preserveWatchOutput"
1117
},
1218
"dependencies": {
19+
"axios": "^1.7.0",
20+
"cheerio": "^1.1.2",
1321
"db-model": "^1.0.0"
1422
}
1523
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import axios from "axios";
2+
import https from "https";
3+
4+
// HTTPS agent shared by every request in this module.
// NOTE(review): `rejectUnauthorized: false` turns off TLS certificate
// verification for these requests. Presumably this works around a certificate
// problem on the target host; confirm it is still required.
const httpsAgent = new https.Agent({ rejectUnauthorized: false });

// Request headers sent with both requests (browser-style User-Agent,
// Accept and Accept-Language values).
const browserHeaders = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
  Accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
  "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
};

// Page requested first; its Set-Cookie headers are reused for the second request.
const BASE_URL =
  "https://saih.chminosil.es/index.php?url=/datos/mapas/mapa:H1/area:HID/acc:1";

// Page whose HTML is returned to the caller.
const TARGET_URL =
  "https://saih.chminosil.es/index.php?url=/datos/situacionEmbalses";
20+
21+
/**
 * Fetches the HTML of the page at TARGET_URL.
 *
 * Two requests are made: BASE_URL is requested first (without following
 * redirects) to collect any cookies the server sets, and those cookies are
 * then sent along with the request to TARGET_URL.
 *
 * @returns The response body of the TARGET_URL request.
 * @throws Whatever axios throws when either request fails or returns a status
 *   rejected by its status validation.
 */
export async function getCuencaPageHTMLContent(): Promise<string> {
  // Redirects are not followed here, and any 2xx/3xx status is accepted, so
  // the Set-Cookie headers of this first response can be read directly.
  const sessionResponse = await axios.get(BASE_URL, {
    httpsAgent,
    headers: browserHeaders,
    maxRedirects: 0,
    validateStatus: (status) => status < 400 && status >= 200,
  });

  // Keep only the leading `name=value` part of each Set-Cookie header and
  // join them into a single Cookie request header value.
  const receivedCookies = sessionResponse.headers["set-cookie"];
  let cookieString = "";
  if (receivedCookies) {
    cookieString = receivedCookies
      .map((cookie: string) => cookie.split(";")[0])
      .join("; ");
  }

  const pageResponse = await axios.get(TARGET_URL, {
    httpsAgent,
    maxRedirects: 10,
    headers: {
      ...browserHeaders,
      Cookie: cookieString,
      Referer: BASE_URL,
    },
  });

  return pageResponse.data;
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
/**
 * One reservoir row as extracted from the scraped page, before it is mapped
 * to the database update entity.
 */
export interface EmbalsesMinoSil {
  /** Numeric identifier of the reservoir. */
  id: number;
  /** Reservoir name. */
  embalse: string;
  /** Total capacity (presumably in hm³, per the field name). */
  capacidadTotalHm3: number;
  /** Current stored volume (presumably in hm³, per the field name). */
  volumenActualHm3: number;
  /** Date of the reading, as a string. */
  fecha: string;
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
export * from "./cuenca.api.js";
2+
export * from "./cuenca.model.js";
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import { getEstadoCuencaMinoSil } from "./integration";
1+
import { scrapeCuencaMinioSil } from "./integration.js";
22

33
console.log("Estado de la Cuenca Miño Sil:");
4-
const result = await getEstadoCuencaMinoSil();
5-
console.log(JSON.stringify(result, null, 2));
4+
const result = await scrapeCuencaMinioSil();
5+
console.log(result);
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
export * from "./integration";
1+
export * from "./integration.js";
22

Lines changed: 12 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,13 @@
1-
import type { Embalse } from "db-model";
1+
import * as cheerio from "cheerio";
2+
import { EmbalseUpdateSAIHEntity } from "db-model";
3+
import { getCuencaPageHTMLContent } from "./api/index.js";
4+
import { extractProvinceTables } from "./scraper/index.js";
5+
import { mapToEmbalseUpdateSAIH } from './scraper/index.js';
26

3-
export const getEstadoCuencaMinoSil = async (): Promise<Embalse[]> => {
4-
return [
5-
{
6-
id: "1",
7-
nombre: "Embalse de Belesar",
8-
provincia: "Lugo",
9-
capacidad: 3000000000,
10-
nivelActual: 2500000000,
11-
fechaUltimoNivel: new Date("2023-10-01"),
12-
porcentajeLlenado: 83.3,
13-
},
14-
{
15-
id: "2",
16-
nombre: "Embalse de Velle",
17-
provincia: "Ourense",
18-
capacidad: 500000000,
19-
nivelActual: 400000000,
20-
fechaUltimoNivel: new Date("2023-10-01"),
21-
porcentajeLlenado: 80.0,
22-
},
23-
];
24-
};
7+
/**
 * Runs the scraping pipeline: downloads the page HTML, loads it into cheerio,
 * extracts the reservoir rows and maps them to `EmbalseUpdateSAIHEntity`
 * records.
 *
 * @returns The mapped entities produced by `mapToEmbalseUpdateSAIH`.
 */
export async function scrapeCuencaMinioSil(): Promise<EmbalseUpdateSAIHEntity[]> {
  const pageHtml = await getCuencaPageHTMLContent();
  const parsedPage: cheerio.CheerioAPI = cheerio.load(pageHtml);

  return mapToEmbalseUpdateSAIH(extractProvinceTables(parsedPage));
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import * as cheerio from "cheerio";
2+
import { EmbalsesMinoSil } from '../api/index.js';
3+
import { mapStringToApiDate } from './helpers.js';
4+
5+
/**
 * Parses a number written with a decimal comma (e.g. "12,5").
 *
 * When the text contains a decimal comma, any dots in it are treated as
 * thousands separators and removed, so "1.234,56" parses as 1234.56 instead
 * of being cut off at the second dot. Text without a comma is handed to
 * `parseFloat` unchanged, because a lone dot is ambiguous (decimal point or
 * thousands separator).
 *
 * @param value Raw cell text.
 * @returns The parsed number, or `NaN` for empty/blank text, the placeholders
 *   "*" and "n/d", or text that does not start with a number.
 */
export function parseEuropeanNumber(value: string): number {
  if (!value) {
    return NaN;
  }

  const trimmed = value.trim();
  if (trimmed === "" || trimmed === "*" || trimmed === "n/d") {
    return NaN;
  }

  // Strip thousands dots only when a decimal comma is present, then turn the
  // comma into the decimal point that parseFloat understands.
  const normalizedValue = trimmed.includes(",")
    ? trimmed.replace(/\./g, "").replace(",", ".")
    : trimmed;

  return parseFloat(normalizedValue);
}
14+
15+
/**
 * Extracts reservoir rows from every `table.tabla` in the loaded document.
 *
 * A row is kept only when it has `<td>` cells and its second cell has the form
 * "<code> - <name>", where the code is one uppercase letter followed by
 * digits. The digits become the `id`. Header rows, the totals row and rows
 * with an unexpected code are skipped, so a single malformed row no longer
 * aborts the whole extraction with a TypeError.
 *
 * @param $ Cheerio API loaded with the page HTML.
 * @returns One entry per kept row, in document order.
 */
export function extractProvinceTables(
  $: cheerio.CheerioAPI
): EmbalsesMinoSil[] {
  const embalses: EmbalsesMinoSil[] = [];

  $("table.tabla tr").each((_index, row) => {
    const cells = $(row).find("td");
    if (cells.length === 0) return; // Skip header rows with <th>

    const nombre = $(cells[1]).text().trim();
    if (!nombre || !nombre.includes(" - ")) return; // Skip totals row

    const [codigo, nombreEmbalse] = nombre.split(" - ");

    // The code must be a single uppercase letter followed by digits. Skip the
    // row when it is not, rather than dereferencing a null match.
    const idMatch = codigo.trim().match(/^([A-Z])(\d+)$/);
    if (!idMatch) return;

    const capacidadTotal = $(cells[4]).text().trim();
    const volumenActual = $(cells[6]).text().trim();
    const fecha = $(cells[8]).text().trim();

    embalses.push({
      id: Number(idMatch[2]),
      embalse: nombreEmbalse.trim(),
      capacidadTotalHm3: parseEuropeanNumber(capacidadTotal),
      volumenActualHm3: parseEuropeanNumber(volumenActual),
      fecha: mapStringToApiDate(fecha),
    });
  });

  return embalses;
}

0 commit comments

Comments
 (0)