drive2 feed

2025-09-13 00:18:39 +03:00
parent cca6e0547a
commit 029839154e
7 changed files with 465 additions and 0 deletions
@@ -0,0 +1,122 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bs4 import BeautifulSoup\n",
+    "import requests\n",
+    "import urllib.parse\n",
+    "import dateutil.parser\n",
+    "from datetime import datetime\n",
+    "from feedgen.feed import FeedGenerator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Russian month names in the genitive case (the form used in dates such as\n",
+    "# '13 сентября 2025') mapped to the English abbreviations matched by the\n",
+    "# %b directive of strptime.\n",
+    "rus_month_to_eng = {\n",
+    "    'января': 'jan',\n",
+    "    'февраля': 'feb',\n",
+    "    'марта': 'mar',\n",
+    "    'апреля': 'apr',\n",
+    "    'мая': 'may',\n",
+    "    'июня': 'jun',\n",
+    "    'июля': 'jul',\n",
+    "    'августа': 'aug',\n",
+    "    'сентября': 'sep',\n",
+    "    'октября': 'oct',\n",
+    "    'ноября': 'nov',\n",
+    "    'декабря': 'dec'\n",
+    "}\n",
+    "\n",
+    "def parse_date(date_str):\n",
+    "    \"\"\"Parse a Russian '<day> <month> <year>' date into a timezone-aware datetime.\n",
+    "\n",
+    "    Args:\n",
+    "        date_str: text such as '13 сентября 2025'. Letter case and the kind or\n",
+    "            amount of whitespace between the three parts do not matter.\n",
+    "\n",
+    "    Returns:\n",
+    "        A datetime at midnight of that day with a fixed +03:00 UTC offset.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if the text does not consist of exactly three parts, or the\n",
+    "            day/year cannot be parsed.\n",
+    "        KeyError: if the month name is not a key of rus_month_to_eng.\n",
+    "    \"\"\"\n",
+    "    # split() without arguments also copes with non-breaking and repeated\n",
+    "    # spaces, which are common in text scraped from HTML.\n",
+    "    day, month_rus, year = date_str.lower().split()\n",
+    "    month_eng = rus_month_to_eng[month_rus]\n",
+    "    # NOTE: %b is locale-dependent; this relies on English month abbreviations,\n",
+    "    # i.e. on LC_TIME not having been switched to another language.\n",
+    "    return datetime.strptime(f'{day}-{month_eng}-{year}-+03:00', '%d-%b-%Y-%z')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Browser-like User-Agent header sent with every request.\n",
+    "headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36\"}\n",
+    "# Seconds to wait on each HTTP request; requests has no default timeout and\n",
+    "# would otherwise block indefinitely on a stalled connection.\n",
+    "REQUEST_TIMEOUT = 30\n",
+    "\n",
+    "# Listing page whose post cards become the feed entries.\n",
+    "url = \"https://www.drive2.ru/experience/jetta/g644510077776067186?to=133933297133670330#results\"\n",
+    "r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)\n",
+    "# Fail loudly on 4xx/5xx instead of building a feed from an error page.\n",
+    "r.raise_for_status()\n",
+    "\n",
+    "# Name the parser explicitly: otherwise bs4 picks whichever parser happens to\n",
+    "# be installed (results may differ between machines) and emits a warning.\n",
+    "soup = BeautifulSoup(r.content, 'html.parser')\n",
+    "\n",
+    "fg = FeedGenerator()\n",
+    "fg.id(url)\n",
+    "fg.title(soup.find('title').get_text(strip=True))\n",
+    "fg.link(href='http://www.drive2.ru', rel='alternate')\n",
+    "fg.language('ru')\n",
+    "\n",
+    "for article in soup.find_all('div', {'class': 'c-block-card'}):\n",
+    "    # Look the title link up before creating the entry, so that a card\n",
+    "    # without a post link is skipped and does not leave a blank entry behind.\n",
+    "    post_title = article.find('a', {'data-ym-target': 'post_title'})\n",
+    "    if post_title is None:\n",
+    "        continue\n",
+    "\n",
+    "    fe = fg.add_entry()\n",
+    "\n",
+    "    # Author name is shown as 'owner (car caption)'.\n",
+    "    author = article.find('div', {'class': 'c-car-card__owner'})\n",
+    "    car = article.find('div', {'class': 'c-car-card__caption'})\n",
+    "    final_author = f\"{author.get_text(strip=True)} ({car.get_text(strip=True)})\"\n",
+    "    fe.author({'name': final_author})\n",
+    "\n",
+    "    fe.title(post_title.get_text(strip=True))\n",
+    "    post_url = post_title['href']\n",
+    "    # Entry id is the last non-empty path segment of the post link.\n",
+    "    # (Named post_id so the builtin id() is not shadowed.)\n",
+    "    post_id = [p for p in post_url.split('/') if p.strip()][-1]\n",
+    "    fe.id(post_id)\n",
+    "    full_url = urllib.parse.urljoin(url, post_url)\n",
+    "    fe.link({'href': full_url})\n",
+    "\n",
+    "    # The publication date is read from the header of the post page itself,\n",
+    "    # which costs one extra request per entry.\n",
+    "    article_r = requests.get(full_url, headers=headers, timeout=REQUEST_TIMEOUT)\n",
+    "    article_r.raise_for_status()\n",
+    "    article_soup = BeautifulSoup(article_r.content, 'html.parser')\n",
+    "    header = article_soup.find('header', {'class': 'x-title-header'})\n",
+    "    date_div = header.find('div', {'class': 'x-secondary-color'})\n",
+    "    date_txt = date_div.get_text(strip=True).lower()\n",
+    "    date = parse_date(date_txt)\n",
+    "    fe.pubDate(date)\n",
+    "\n",
+    "    desc = article.find('div', {'class': 'c-post-preview__lead'})\n",
+    "    fe.description(desc.get_text(strip=True))\n",
+    "\n",
+    "    # The preview picture is optional: attach it only when the card really\n",
+    "    # contains an <img> with a src attribute.\n",
+    "    preview_uri_div = article.find('div', {'class': 'c-preview-pic'})\n",
+    "    preview_img = preview_uri_div.find('img') if preview_uri_div is not None else None\n",
+    "    if preview_img is not None and preview_img.get('src'):\n",
+    "        fe.enclosure(preview_img['src'])\n",
+    "\n",
+    "fg.atom_file('result.xml')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}