scripts/pirate_program/all_pages_scrapping.py

import requests
from bs4 import BeautifulSoup
import json
import time

def scrape_page(url):
    # Fetch the HTML content of the page
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the relevant fields from the page
    title = soup.select_one("#main figure h1").text.strip()
    tags = ", ".join([a.text.strip() for a in soup.select_one("#main figure ul").find_all("a")])
    image_url = soup.select_one("#main figure img")["src"]
    date = soup.select_one("#main #article-date").text.strip()
    content = soup.select_one("#main #content").text.strip()
    html_content = str(soup.select_one("#main #content"))

    return {
        "title": title,
        "tags": tags,
        "image": image_url,
        "date": date,
        "content": content,
        "html_content": html_content,
        "url": url,
    }

# Load the URLs from the JSON file
with open("partipirate_links.json", "r", encoding="utf-8") as f:
    urls = json.load(f)

# Loop over the URLs (capped at the first 999 pages)
all_pages = []
counter = 1
for url in urls:
    if counter < 1000:
        print(f"{counter} / {len(urls)}, fetching page ", url)
        page = scrape_page(url)
        all_pages.append(page)
        # Short delay between requests to avoid hammering the server
        time.sleep(0.2)
    counter += 1

# Save the scraped pages to a JSON file
with open("all_pages.json", "w", encoding="utf-8") as f:
    json.dump(all_pages, f, ensure_ascii=False, indent=4)