drive2 feed

This commit is contained in:
Pan'kov Maksim
2025-09-13 00:18:39 +03:00
parent cca6e0547a
commit 029839154e
7 changed files with 465 additions and 0 deletions

122
services/drive2/dev.ipynb Normal file
View File

@@ -0,0 +1,122 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import requests\n",
"import urllib.parse\n",
"import dateutil.parser\n",
"from datetime import datetime\n",
"from feedgen.feed import FeedGenerator"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Russian month names in the genitive case (the form used in dates such as\n",
"# '13 сентября 2025') mapped to the English abbreviations that strptime's\n",
"# %b directive understands.\n",
"rus_month_to_eng = {\n",
"    'января': 'jan',\n",
"    'февраля': 'feb',\n",
"    'марта': 'mar',\n",
"    'апреля': 'apr',\n",
"    'мая': 'may',\n",
"    'июня': 'jun',\n",
"    'июля': 'jul',\n",
"    'августа': 'aug',\n",
"    'сентября': 'sep',\n",
"    'октября': 'oct',\n",
"    'ноября': 'nov',\n",
"    'декабря': 'dec'\n",
"}\n",
"\n",
"def parse_date(date_str):\n",
"    \"\"\"Convert a Russian date such as '13 сентября 2025' to an aware datetime.\n",
"\n",
"    The input must contain a day number, a Russian month name (genitive case)\n",
"    and a year, separated by whitespace; letter case is ignored.\n",
"    The result is midnight of that day with a fixed +03:00 UTC offset.\n",
"\n",
"    Raises:\n",
"        ValueError: if the string does not split into exactly three parts, or\n",
"            the day/year part cannot be parsed.\n",
"        KeyError: if the month name is not a key of rus_month_to_eng.\n",
"    \"\"\"\n",
"    # split() without arguments tolerates repeated, leading/trailing and\n",
"    # non-breaking whitespace, which text scraped from HTML often contains.\n",
"    day, month_rus, year = date_str.lower().split()\n",
"    # NOTE: %b is matched against the current locale's abbreviated month names;\n",
"    # the English abbreviations above assume the default (C/English) locale.\n",
"    res = datetime.strptime('-'.join([day, rus_month_to_eng[month_rus], year, '+03:00']), '%d-%b-%Y-%z')\n",
"    return res"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# Build an Atom feed (result.xml) from a drive2.ru listing page: one feed entry\n",
"# per post card, with the publication date read from the post's own page.\n",
"headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36\"}\n",
"\n",
"url = \"https://www.drive2.ru/experience/jetta/g644510077776067186?to=133933297133670330#results\"\n",
"# Without a timeout requests may wait forever on a stalled connection.\n",
"request_timeout = 30  # seconds\n",
"r = requests.get(url, headers=headers, timeout=request_timeout)\n",
"# Fail loudly on HTTP errors instead of parsing an error page.\n",
"r.raise_for_status()\n",
"\n",
"# Name the parser explicitly so the result does not depend on which optional\n",
"# parsers happen to be installed (and bs4 does not warn about guessing).\n",
"soup = BeautifulSoup(r.content, 'html.parser')\n",
"\n",
"fg = FeedGenerator()\n",
"fg.id(url)\n",
"fg.title(soup.find('title').get_text(strip=True))\n",
"fg.link( href='http://www.drive2.ru', rel='alternate' )\n",
"fg.language('ru')\n",
"\n",
"for article in soup.find_all('div', {'class': 'c-block-card'}):\n",
"    post_title = article.find('a', {'data-ym-target': 'post_title'})\n",
"    if post_title is None or not post_title.get('href'):\n",
"        # Card without a post link: there is nothing to point the entry at.\n",
"        continue\n",
"\n",
"    post_url = post_title['href']\n",
"    # The last non-empty path segment of the post URL serves as the entry id.\n",
"    url_segments = [p for p in post_url.split('/') if p.strip()]\n",
"    if not url_segments:\n",
"        continue\n",
"    post_id = url_segments[-1]\n",
"\n",
"    # Create the entry only after the card is known to be usable, so skipped\n",
"    # cards do not leave half-filled entries in the feed.\n",
"    fe = fg.add_entry()\n",
"\n",
"    author = article.find('div', {'class': 'c-car-card__owner'})\n",
"    car = article.find('div', {'class': 'c-car-card__caption'})\n",
"    author_name = author.get_text(strip=True) if author is not None else ''\n",
"    car_name = car.get_text(strip=True) if car is not None else ''\n",
"    if author_name and car_name:\n",
"        final_author = f\"{author_name} ({car_name})\"\n",
"    else:\n",
"        final_author = author_name or car_name\n",
"    if final_author:\n",
"        fe.author({'name': final_author})\n",
"\n",
"    fe.title(post_title.get_text(strip=True))\n",
"    fe.id(post_id)\n",
"    full_url = urllib.parse.urljoin(url, post_url)\n",
"    fe.link({'href': full_url})\n",
"\n",
"    # The listing card carries no date, so it is read from the post page.\n",
"    article_r = requests.get(full_url, headers=headers, timeout=request_timeout)\n",
"    article_r.raise_for_status()\n",
"    article_soup = BeautifulSoup(article_r.content, 'html.parser')\n",
"    header = article_soup.find('header', {'class': 'x-title-header'})\n",
"    date_div = header.find('div', {'class': 'x-secondary-color'}) if header is not None else None\n",
"    if date_div is not None:\n",
"        date_txt = date_div.get_text(strip=True).lower()\n",
"        date = parse_date(date_txt)\n",
"        fe.pubDate(date)\n",
"\n",
"    desc = article.find('div', {'class': 'c-post-preview__lead'})\n",
"    if desc is not None:\n",
"        fe.description(desc.get_text(strip=True))\n",
"\n",
"    preview_uri_div = article.find('div', {'class': 'c-preview-pic'})\n",
"    preview_img = preview_uri_div.find('img') if preview_uri_div is not None else None\n",
"    if preview_img is not None and preview_img.get('src'):\n",
"        preview_uri = preview_img['src']\n",
"        fe.enclosure(preview_uri)\n",
"\n",
"fg.atom_file('result.xml')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}