scripts/scraping/main.ts

102 lines
3.0 KiB
TypeScript

/**
 * Scraping of the books listed in the Briis-sous-Forges media library catalogue.
 */
// @ts-ignore
import https from 'https';
import WriteFile from "./utils";
const axios = require('axios');
const cheerio = require('cheerio');
/** Host name of the media library website, without scheme; passed as `hostname` to `getHtml`. */
const url = "www.mediatheque-de-briis-sous-forges.net";

/** Request path of the catalogue search listing; the query string selects result page 2. */
const fetching_path =
  "/mediatheque-de-briis-sous-forges.net/opac/recherche/catalogue?node=0&value=0&page=2";
/** One catalogue entry as extracted from a `table.notice` element of the listing page. */
interface Book {
  /** First line of the text found in the notice's second cell. */
  author: string;
  /** Part of the second text line that precedes the first `/`. */
  title: string;
  /** Part of the second text line that follows the first `/`. */
  description: string;
  /** Value of the `src` attribute of the notice's image. */
  img: string;
}

/** Accumulates every book scraped during the run; serialised to `books.json` once scraping is done. */
const books: Array<Book> = [];
// Other result pages follow the same URL pattern, for example:
// http://www.mediatheque-de-briis-sous-forges.net/mediatheque-de-briis-sous-forges.net/opac/recherche/catalogue?node=0&value=0&page=2
// NOTE(review): presumably the number of the last catalogue page — confirm;
// this constant is not referenced anywhere in the visible code.
const page_max = 1927;
/**
 * Parses an HTML document and selects every `table` element carrying the
 * `notice` class.
 *
 * @param html - Raw HTML markup of a page.
 * @returns The cheerio selection matching `table.notice`.
 */
const getTables = (html: string): any => cheerio.load(html)("table.notice");
/**
 * Downloads a page over HTTPS and resolves with its body decoded as UTF-8 text.
 *
 * @param hostname - Host to contact, without scheme.
 * @param path - Request path, including the query string.
 * @returns A promise resolving with the complete response body.
 * @throws Rejects when the request or the response stream fails, or when the
 * server answers with a non-2xx status code.
 */
const getHtml = (hostname: string, path: string): Promise<string> =>
  new Promise<string>((resolve, reject) => {
    https
      .get(
        {
          hostname,
          path,
          method: "GET",
        },
        (res) => {
          const status = res.statusCode ?? 0;
          if (status < 200 || status >= 300) {
            // Drain the unread body so the socket is released, then surface
            // the failure instead of handing an error page to the parser.
            res.resume();
            reject(
              new Error(`Request to ${hostname}${path} failed with status ${status}`)
            );
            return;
          }

          // Decode at the stream level: appending raw Buffer chunks to a string
          // corrupts multi-byte UTF-8 characters split across chunk boundaries.
          res.setEncoding("utf8");

          let html = "";
          res.on("data", (chunk: string) => {
            html += chunk;
          });
          res.on("end", () => resolve(html));
          // Without this handler a failure while reading the body would leave
          // the promise pending forever.
          res.on("error", reject);
        }
      )
      .on("error", (error) => {
        console.error(error);
        reject(error);
      });
  });
// fetchData(url).then((res: any) => {
// const html = res.data;
// const $ = cheerio.load(html);
// const statsTable :any = $('table.notice');
// console.log('statsTable', statsTable)
// statsTable.each(function(){
// let elem:any = this;
// let author = $(elem).find('td').eq(2).text();
// // let img = $(this).find('img').attr('src');
// // console.log(elem);
// console.log(author);
// });
// })
/**
 * Serialises the collected `books` as pretty-printed JSON (2-space indent) and
 * passes the result to `WriteFile` under the name `books.json`.
 */
function writeBookScrapping() {
  const payload = JSON.stringify(books, null, 2);
  WriteFile('books.json', payload);
}
// Entry point: download the listing page, convert every `table.notice` element
// into a Book, then write the collected books out.
getHtml(url, fetching_path)
  .then(getTables)
  .then((tables: any) =>
    tables.each((_: any, table: any) => {
      const $ = cheerio.load(table);

      // The second cell carries the author on its first line and
      // "title/description" on the following one.
      const cellText: string = $(table).find('td').eq(1).text();

      // Default every missing piece to '' so that a single notice with an
      // unexpected layout cannot throw and abort the run before anything is
      // written, and so that each entry satisfies the Book interface.
      const [authorLine = '', titleLine = ''] = cellText.split('\n');
      const [title = '', description = ''] = titleLine.split('/');

      const imgSrc: string | undefined = $(table).find('td img').attr('src');
      console.log(imgSrc);

      books.push({
        author: authorLine,
        title,
        description,
        img: imgSrc ?? '',
      });
    })
  )
  .then(writeBookScrapping)
  .catch((error) => console.log(error));