drive2 feed

This commit is contained in:
Pan'kov Maksim
2025-09-13 00:18:39 +03:00
parent cca6e0547a
commit 029839154e
7 changed files with 465 additions and 0 deletions

225
services/drive2/.gitignore vendored Normal file
View File

@@ -0,0 +1,225 @@
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
# Created by https://www.toptal.com/developers/gitignore/api/windows,visualstudiocode,python
# Edit at https://www.toptal.com/developers/gitignore?templates=windows,visualstudiocode,python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
# Dump file
*.stackdump
# Folder config file
[Dd]esktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp
# Windows shortcuts
*.lnk
# End of https://www.toptal.com/developers/gitignore/api/windows,visualstudiocode,python
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)

View File

@@ -0,0 +1,11 @@
# syntax=docker/dockerfile:1

# Slim variant keeps the image small.
# NOTE(review): assumes every package in reqs.txt installs from a wheel or is
# pure Python (no compiler needed) - confirm with a clean build.
FROM python:3.13-slim

# Flush stdout/stderr immediately so server logs show up in `docker logs`.
ENV PYTHONUNBUFFERED=1

# Install dependencies before copying the application so this layer stays
# cached until reqs.txt itself changes.
COPY ./reqs.txt /opt/feed-server/reqs.txt
RUN pip install --no-cache-dir -r /opt/feed-server/reqs.txt

COPY ./app /opt/feed-server/app
WORKDIR /opt/feed-server/app

# Documentation only: the port the server listens on (see CMD).
EXPOSE 80

# NOTE(review): the process still runs as root because it binds port 80.
# Switching to a non-root USER is recommended, but needs either a port >= 1024
# or a runtime that allows unprivileged low ports - decide together with the
# port mapping used to run this image.
CMD ["fastapi", "run", "server.py", "--port", "80"]

View File

@@ -0,0 +1,81 @@
from bs4 import BeautifulSoup
import requests
import urllib.parse
import dateutil.parser
from datetime import datetime
from feedgen.feed import FeedGenerator
# Russian month names in the genitive case (as in "13 сентября 2025") mapped
# to the English abbreviations that strptime's %b directive understands.
rus_month_to_eng = {
    'января': 'jan',
    'февраля': 'feb',
    'марта': 'mar',
    'апреля': 'apr',
    'мая': 'may',
    'июня': 'jun',
    'июля': 'jul',
    'августа': 'aug',
    'сентября': 'sep',
    'октября': 'oct',
    'ноября': 'nov',
    'декабря': 'dec'
}


def parse_date(date_str):
    """Parse a Russian "<day> <month> <year>" date into an aware datetime.

    The input is lower-cased and split on any run of whitespace, so
    non-breaking spaces and doubled spaces (common in scraped HTML text) are
    accepted. The result is midnight of that day with a fixed +03:00 UTC
    offset (presumably Moscow time - confirm against the site).

    Raises:
        ValueError: if the string does not consist of exactly three
            whitespace-separated parts, or the day/year are not valid.
        KeyError: if the month name is not in ``rus_month_to_eng``.
    """
    # split() without arguments also treats '\xa0' as a separator, unlike
    # split(' ') which only splits on single ASCII spaces.
    day, month_rus, year = date_str.lower().split()
    return datetime.strptime(
        '-'.join([day, rus_month_to_eng[month_rus], year, '+03:00']),
        '%d-%b-%Y-%z',
    )
# Browser-like User-Agent sent with every outgoing request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"}

# Seconds to wait for any single HTTP request before giving up.
_REQUEST_TIMEOUT = 30

# Link text appended to each post preview on the listing page.
_READ_MORE_SUFFIX = 'Читать дальше'


def _node_text(node):
    """Return the stripped text of a BeautifulSoup node, or '' if it is None."""
    return node.get_text(strip=True) if node is not None else ''


def _fetch_pub_date(article_url):
    """Fetch a post page and return its publication date, or None.

    Best effort: network errors, a missing date element or an unparseable
    date all yield None so that one bad post does not abort the whole feed.
    """
    try:
        article_r = requests.get(article_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        return None
    if not article_r.ok:
        return None
    article_soup = BeautifulSoup(article_r.content, features='lxml')
    header = article_soup.find('header', {'class': 'x-title-header'})
    if not header:
        return None
    date_div = header.find('div', {'class': 'x-secondary-color'})
    if not date_div:
        return None
    try:
        return parse_date(date_div.get_text(strip=True).lower())
    except (KeyError, ValueError):
        # Unknown month name or unexpected date layout.
        return None


def generate(url):
    """Scrape a listing page and return it as an Atom feed.

    Every ``div.c-block-card`` on the page becomes one feed entry with
    author, title, id, link, an HTML summary (with the preview image
    prepended when there is one) and, when it can be read from the post's
    own page, a publication date. Cards without a post title link are
    skipped.

    Args:
        url: address of the listing page; also used as the feed id and as
            the base for resolving relative post links.

    Returns:
        The Atom document as produced by ``FeedGenerator.atom_str()``.

    Raises:
        requests.RequestException: if the listing page cannot be fetched or
            answers with an HTTP error status.
    """
    # NOTE(review): ``url`` is fetched as given. If it can come from
    # untrusted callers, consider restricting it to the expected host(s).
    import html  # local import: only needed for escaping the summary

    r = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    # Fail loudly instead of building a feed out of an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, features='lxml')

    fg = FeedGenerator()
    fg.id(url)
    # Fall back to the URL when the page has no usable <title>.
    fg.title(_node_text(soup.find('title')) or url)
    fg.link(href='http://www.drive2.ru', rel='alternate')
    fg.language('ru')

    for article in soup.find_all('div', {'class': 'c-block-card'}):
        post_title = article.find('a', {'data-ym-target': 'post_title'})
        if post_title is None or not post_title.get('href'):
            # Not a post card (or markup changed): nothing to link to.
            continue

        fe = fg.add_entry()

        author_name = _node_text(article.find('div', {'class': 'c-car-card__owner'}))
        car_name = _node_text(article.find('div', {'class': 'c-car-card__caption'}))
        if author_name and car_name:
            final_author = f"{author_name} ({car_name})"
        else:
            final_author = author_name or car_name
        if final_author:
            fe.author({'name': final_author})

        fe.title(post_title.get_text(strip=True))

        post_url = post_title['href']
        full_url = urllib.parse.urljoin(url, post_url)
        # The entry id is the last non-empty path segment of the post link.
        segments = [p for p in post_url.split('/') if p.strip()]
        fe.id(segments[-1] if segments else full_url)
        fe.link({'href': full_url})

        pub_date = _fetch_pub_date(full_url)
        if pub_date is not None:
            fe.pubDate(pub_date)

        lead = _node_text(article.find('div', {'class': 'c-post-preview__lead'}))
        # removesuffix drops only the literal trailing link text; rstrip with
        # the same argument would strip any trailing run of those characters
        # and eat the end of the real preview text.
        lead = lead.removesuffix(_READ_MORE_SUFFIX).rstrip()
        # The summary is emitted as HTML, so scraped text must be escaped.
        description = html.escape(lead)

        preview_uri_div = article.find('div', {'class': 'c-preview-pic'})
        img = preview_uri_div.find('img') if preview_uri_div else None
        preview_uri = img.get('src') if img else None
        if preview_uri:
            description = f"<img src='{html.escape(preview_uri)}' referrerpolicy='no-referrer' /> {description}"

        fe.summary(description, type='html')

    return fg.atom_str()

View File

@@ -0,0 +1,12 @@
import math
import threading

from cachetools import cached, TTLCache
from fastapi import FastAPI, Response

from generator import generate
app = FastAPI()

# Generated feeds are cached per URL for one hour. The cache is bounded
# because the key is a caller-supplied query parameter: an unbounded cache
# would let distinct URLs grow memory without limit.
_feed_cache = TTLCache(maxsize=256, ttl=3600)
# Sync endpoints run on a thread pool, so cache access must be serialized.
_feed_cache_lock = threading.Lock()


@app.get("/")
@cached(cache=_feed_cache, lock=_feed_cache_lock)
def read_root(url: str):
    """Return the Atom feed generated for ``url``.

    Args:
        url: query parameter passed straight to ``generate``.

    Returns:
        A ``Response`` carrying the Atom document with the
        ``application/atom+xml`` media type. Responses are cached per URL
        for up to an hour; calls that raise are not cached.
    """
    atom = generate(url)
    return Response(content=atom, media_type='application/atom+xml; charset=utf-8')

122
services/drive2/dev.ipynb Normal file
View File

@@ -0,0 +1,122 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import requests\n",
"import urllib.parse\n",
"import dateutil.parser\n",
"from datetime import datetime\n",
"from feedgen.feed import FeedGenerator"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"rus_month_to_eng = {\n",
" 'января': 'jan',\n",
" 'февраля': 'feb',\n",
" 'марта': 'mar',\n",
" 'апреля': 'apr',\n",
" 'мая': 'may',\n",
" 'июня': 'jun',\n",
" 'июля': 'jul',\n",
" 'августа': 'aug',\n",
" 'сентября': 'sep',\n",
" 'октября': 'oct',\n",
" 'ноября': 'nov',\n",
" 'декабря': 'dec'\n",
"}\n",
"\n",
"def parse_date(date_str):\n",
" day, month_rus, year = date_str.lower().split(' ')\n",
" res = datetime.strptime('-'.join([day, rus_month_to_eng[month_rus], year, '+03:00']), '%d-%b-%Y-%z')\n",
" return res"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36\"}\n",
"\n",
"url = \"https://www.drive2.ru/experience/jetta/g644510077776067186?to=133933297133670330#results\"\n",
"r = requests.get(url, headers=headers)\n",
"\n",
"soup = BeautifulSoup(r.content)\n",
"\n",
"fg = FeedGenerator()\n",
"fg.id(url)\n",
"fg.title(soup.find('title').get_text(strip=True))\n",
"fg.link( href='http://www.drive2.ru', rel='alternate' )\n",
"fg.language('ru')\n",
"\n",
"for article in soup.find_all('div', {'class': 'c-block-card'}):\n",
" fe = fg.add_entry()\n",
"\n",
" author = article.find('div', {'class': 'c-car-card__owner'})\n",
" car = article.find('div', {'class': 'c-car-card__caption'})\n",
" final_author = f\"{author.get_text(strip=True)} ({car.get_text(strip=True)})\"\n",
"\n",
" fe.author({'name': final_author})\n",
"\n",
"\n",
" post_title = article.find('a', {'data-ym-target': 'post_title'})\n",
" fe.title(post_title.get_text(strip=True))\n",
" post_url = post_title['href']\n",
" id = [p for p in post_url.split('/') if p.strip()][-1]\n",
" fe.id(id)\n",
" full_url = urllib.parse.urljoin(url, post_url)\n",
" fe.link({'href': full_url})\n",
"\n",
" article_r = requests.get(full_url, headers=headers)\n",
" article_soup = BeautifulSoup(article_r.content)\n",
" header = article_soup.find('header', {'class': 'x-title-header'})\n",
" date_div = header.find('div', {'class': 'x-secondary-color'})\n",
" date_txt = date_div.get_text(strip=True).lower()\n",
" date = parse_date(date_txt)\n",
" fe.pubDate(date)\n",
"\n",
" desc = article.find('div', {'class': 'c-post-preview__lead'})\n",
" fe.description(desc.get_text(strip=True))\n",
"\n",
" preview_uri_div = article.find('div', {'class': 'c-preview-pic'})\n",
" if (preview_uri_div):\n",
" preview_uri = preview_uri_div.find('img')['src']\n",
" fe.enclosure(preview_uri)\n",
"\n",
"fg.atom_file('result.xml')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,9 @@
services:
  feed-server:
    image: feed-server
    build:
      context: .
    ports:
      # HOST:CONTAINER. Quoted so YAML always reads the mapping as a string.
      - "8890:80"
# Example request:
# http://localhost:8890/?url=https://www.drive2.ru/experience/jetta/g644510077776067186

5
services/drive2/reqs.txt Normal file
View File

@@ -0,0 +1,5 @@
beautifulsoup4==4.13.5
requests==2.32.5
feedgen==1.0.0
# Pinned like the other requirements so image rebuilds are reproducible.
fastapi[standard]==0.116.1
cachetools==6.2.0