{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# drive2.ru experience page to Atom feed\n",
    "\n",
    "Downloads one drive2.ru listing page, opens every post linked from it to read the publication date, and writes the collected entries to `result.xml` as an Atom feed.\n",
    "\n",
    "Running the notebook performs one HTTP request for the listing plus one request per post."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "import urllib.parse\n",
    "import dateutil.parser\n",
    "from datetime import datetime, timedelta, timezone\n",
    "from feedgen.feed import FeedGenerator"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Date parsing\n",
    "\n",
    "Post pages show the date as `<day> <Russian month name in the genitive case> <year>`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Russian genitive month names mapped to English abbreviations.\n",
    "# The entries MUST stay in calendar order: month numbers are derived from it below.\n",
    "rus_month_to_eng = {\n",
    "    'января': 'jan',\n",
    "    'февраля': 'feb',\n",
    "    'марта': 'mar',\n",
    "    'апреля': 'apr',\n",
    "    'мая': 'may',\n",
    "    'июня': 'jun',\n",
    "    'июля': 'jul',\n",
    "    'августа': 'aug',\n",
    "    'сентября': 'sep',\n",
    "    'октября': 'oct',\n",
    "    'ноября': 'nov',\n",
    "    'декабря': 'dec'\n",
    "}\n",
    "\n",
    "# Month name -> 1..12. Building the datetime from numbers avoids strptime('%b'),\n",
    "# whose accepted month names depend on the active locale.\n",
    "rus_month_to_number = {\n",
    "    name: number for number, name in enumerate(rus_month_to_eng, start=1)\n",
    "}\n",
    "assert len(rus_month_to_number) == 12, 'expected exactly twelve month names'\n",
    "\n",
    "# The page text carries no zone information; a fixed UTC+03:00 offset is attached.\n",
    "UTC_PLUS_3 = timezone(timedelta(hours=3))\n",
    "\n",
    "\n",
    "def parse_date(date_str):\n",
    "    \"\"\"Parse a date such as '5 сентября 2023' into a timezone-aware datetime.\n",
    "\n",
    "    Args:\n",
    "        date_str: '<day> <Russian genitive month name> <year>'. Letter case and\n",
    "            the amount of whitespace between the parts do not matter.\n",
    "\n",
    "    Returns:\n",
    "        datetime at 00:00 of that day with a fixed UTC+03:00 offset.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the string does not split into exactly three parts, or\n",
    "            the day/year parts are not valid numbers for a date.\n",
    "        KeyError: if the month name is not in rus_month_to_eng.\n",
    "    \"\"\"\n",
    "    day, month_rus, year = date_str.lower().split()\n",
    "    return datetime(int(year), rus_month_to_number[month_rus], int(day), tzinfo=UTC_PLUS_3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Desktop Chrome User-Agent string sent with every request.\n",
    "headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36\"}\n",
    "\n",
    "# Listing page that is converted into the feed; also used as the feed id.\n",
    "url = \"https://www.drive2.ru/experience/jetta/g644510077776067186?to=133933297133670330#results\"\n",
    "\n",
    "REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever\n",
    "HTML_PARSER = 'html.parser'  # explicit parser: same tree on every machine, no bs4 warning\n",
    "OUTPUT_PATH = 'result.xml'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch_soup(session, page_url):\n",
    "    \"\"\"Download page_url with the given requests session and parse it.\n",
    "\n",
    "    Returns:\n",
    "        BeautifulSoup tree of the response body.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: if the server answers with a 4xx/5xx status.\n",
    "        requests.RequestException: on connection errors or timeout.\n",
    "    \"\"\"\n",
    "    response = session.get(page_url, timeout=REQUEST_TIMEOUT)\n",
    "    # Fail loudly instead of parsing an error page as if it were the listing.\n",
    "    response.raise_for_status()\n",
    "    return BeautifulSoup(response.content, HTML_PARSER)\n",
    "\n",
    "\n",
    "def node_text(node):\n",
    "    \"\"\"Return the stripped text of a bs4 node, or '' when node is None.\"\"\"\n",
    "    return node.get_text(strip=True) if node is not None else ''\n",
    "\n",
    "\n",
    "def fetch_post_date(session, post_url):\n",
    "    \"\"\"Open a post page and return its date via parse_date.\n",
    "\n",
    "    Returns None when the page has no 'x-title-header' header or no\n",
    "    'x-secondary-color' block inside it. Errors from parse_date propagate.\n",
    "    \"\"\"\n",
    "    post_soup = fetch_soup(session, post_url)\n",
    "    header = post_soup.find('header', {'class': 'x-title-header'})\n",
    "    date_div = header.find('div', {'class': 'x-secondary-color'}) if header is not None else None\n",
    "    if date_div is None:\n",
    "        return None\n",
    "    return parse_date(node_text(date_div))\n",
    "\n",
    "\n",
    "def add_post_entry(fg, session, article, base_url):\n",
    "    \"\"\"Append one feed entry built from a listing card.\n",
    "\n",
    "    Args:\n",
    "        fg: FeedGenerator that receives the entry.\n",
    "        session: requests session used to fetch the post page for its date.\n",
    "        article: bs4 node of one 'c-block-card' element.\n",
    "        base_url: URL of the listing page, used to resolve relative links.\n",
    "\n",
    "    Returns:\n",
    "        The new feed entry, or None when the card has no post-title link\n",
    "        (nothing is added to the feed in that case).\n",
    "    \"\"\"\n",
    "    post_title = article.find('a', {'data-ym-target': 'post_title'})\n",
    "    if post_title is None or not post_title.get('href'):\n",
    "        # Checked before add_entry() so the feed never holds an empty entry.\n",
    "        return None\n",
    "\n",
    "    post_url = post_title['href']\n",
    "    full_url = urllib.parse.urljoin(base_url, post_url)\n",
    "    path_parts = [part for part in post_url.split('/') if part.strip()]\n",
    "\n",
    "    fe = fg.add_entry()\n",
    "    fe.title(node_text(post_title) or full_url)\n",
    "    # Entry id is the last non-empty segment of the post link.\n",
    "    fe.id(path_parts[-1] if path_parts else full_url)\n",
    "    fe.link({'href': full_url})\n",
    "\n",
    "    owner = node_text(article.find('div', {'class': 'c-car-card__owner'}))\n",
    "    car = node_text(article.find('div', {'class': 'c-car-card__caption'}))\n",
    "    author_name = f'{owner} ({car})' if owner and car else owner or car\n",
    "    if author_name:\n",
    "        fe.author({'name': author_name})\n",
    "\n",
    "    # The listing card is not used for the date; it is read from the post page.\n",
    "    published = fetch_post_date(session, full_url)\n",
    "    if published is not None:\n",
    "        fe.pubDate(published)\n",
    "\n",
    "    lead = node_text(article.find('div', {'class': 'c-post-preview__lead'}))\n",
    "    if lead:\n",
    "        fe.description(lead)\n",
    "\n",
    "    preview_div = article.find('div', {'class': 'c-preview-pic'})\n",
    "    preview_img = preview_div.find('img') if preview_div is not None else None\n",
    "    preview_src = preview_img.get('src') if preview_img is not None else None\n",
    "    if preview_src:\n",
    "        # urljoin leaves absolute URLs untouched and resolves relative ones.\n",
    "        fe.enclosure(urllib.parse.urljoin(base_url, preview_src))\n",
    "\n",
    "    return fe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build and write the feed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One session for all requests: shared headers and connection reuse.\n",
    "with requests.Session() as session:\n",
    "    session.headers.update(headers)\n",
    "\n",
    "    soup = fetch_soup(session, url)\n",
    "\n",
    "    fg = FeedGenerator()\n",
    "    fg.id(url)\n",
    "    # Fall back to the URL so the feed still has a title if the page has none.\n",
    "    fg.title(node_text(soup.find('title')) or url)\n",
    "    fg.link(href='http://www.drive2.ru', rel='alternate')\n",
    "    fg.language('ru')\n",
    "\n",
    "    for article in soup.find_all('div', {'class': 'c-block-card'}):\n",
    "        add_post_entry(fg, session, article, url)\n",
    "\n",
    "fg.atom_file(OUTPUT_PATH)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}