diff --git a/services/drive2/.gitignore b/services/drive2/.gitignore new file mode 100644 index 0000000..e0c5b46 --- /dev/null +++ b/services/drive2/.gitignore @@ -0,0 +1,225 @@ +# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig +# Created by https://www.toptal.com/developers/gitignore/api/windows,visualstudiocode,python +# Edit at https://www.toptal.com/developers/gitignore?templates=windows,visualstudiocode,python + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
# Russian genitive month names mapped to the English abbreviations that
# ``strptime``'s ``%b`` directive parses. ``%b`` is matched against the current
# locale's month names, so this relies on the process running with an
# English/C time locale.
rus_month_to_eng = {
    'января': 'jan',
    'февраля': 'feb',
    'марта': 'mar',
    'апреля': 'apr',
    'мая': 'may',
    'июня': 'jun',
    'июля': 'jul',
    'августа': 'aug',
    'сентября': 'sep',
    'октября': 'oct',
    'ноября': 'nov',
    'декабря': 'dec',
}

# Link label that ends a post preview and must not leak into the feed summary.
_READ_MORE_SUFFIX = 'Читать дальше'

# Seconds to wait on a single HTTP request before giving up; without a timeout
# ``requests`` waits forever and a stalled upstream would pin a worker.
_REQUEST_TIMEOUT = 15


def parse_date(date_str):
    """Parse a ``"<day> <russian month> <year>"`` string into an aware datetime.

    The result is midnight of that day with a fixed ``+03:00`` UTC offset
    (presumably Moscow time -- confirm if the site ever localises dates).

    Args:
        date_str: Text such as ``"12 сентября 2023"``. Case is ignored and the
            three parts may be separated by any run of whitespace, including
            non-breaking spaces.

    Returns:
        A timezone-aware ``datetime``.

    Raises:
        ValueError: If the text does not consist of exactly three parts or the
            day/year are not valid numbers.
        KeyError: If the month name is not in ``rus_month_to_eng``.
    """
    # split() without an argument collapses repeated whitespace and also
    # splits on U+00A0, which split(' ') would not.
    day, month_rus, year = date_str.lower().split()
    month_eng = rus_month_to_eng[month_rus]
    return datetime.strptime(f'{day}-{month_eng}-{year}-+03:00', '%d-%b-%Y-%z')


# Browser-like User-Agent sent with every request made by this module.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"}


def strip_read_more(text):
    """Return *text* without a trailing "Читать дальше" label.

    ``str.rstrip`` treats its argument as a set of characters and would also
    eat legitimate trailing letters of the preview; ``removesuffix`` removes
    the exact label only.
    """
    return text.removesuffix(_READ_MORE_SUFFIX).rstrip()


def _node_text(node):
    """Return the stripped text of a BeautifulSoup node, or ``''`` for ``None``."""
    return node.get_text(strip=True) if node is not None else ''


def _fetch_pub_date(session, post_url):
    """Best-effort lookup of a post's publication date from the post page.

    Returns ``None`` when the page cannot be fetched, the date element is
    missing, or its text cannot be parsed, so one bad post never breaks the
    whole feed.
    """
    try:
        resp = session.get(post_url, timeout=_REQUEST_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException:
        return None

    page = BeautifulSoup(resp.content, features='lxml')
    header = page.find('header', {'class': 'x-title-header'})
    date_div = header.find('div', {'class': 'x-secondary-color'}) if header is not None else None
    if date_div is None:
        return None

    try:
        return parse_date(date_div.get_text(strip=True))
    except (KeyError, ValueError):
        return None


def _add_entry(fg, session, page_url, article):
    """Append one feed entry built from a ``c-block-card`` element.

    Cards without a usable post link are skipped, because an entry with no id
    or title would make the feed invalid.
    """
    post_title = article.find('a', {'data-ym-target': 'post_title'})
    post_url = post_title.get('href') if post_title is not None else None
    path_parts = [p for p in (post_url or '').split('/') if p.strip()]
    if not path_parts:
        return

    # The entry id is the last non-empty path segment of the post link.
    entry_id = path_parts[-1]
    full_url = urllib.parse.urljoin(page_url, post_url)

    fe = fg.add_entry()
    fe.id(entry_id)
    fe.title(_node_text(post_title) or entry_id)
    fe.link({'href': full_url})

    owner = _node_text(article.find('div', {'class': 'c-car-card__owner'}))
    car = _node_text(article.find('div', {'class': 'c-car-card__caption'}))
    author_name = (f"{owner} ({car})" if car else owner).strip()
    if author_name:
        fe.author({'name': author_name})

    pub_date = _fetch_pub_date(session, full_url)
    if pub_date is not None:
        fe.pubDate(pub_date)

    lead = article.find('div', {'class': 'c-post-preview__lead'})
    description = strip_read_more(_node_text(lead))
    if article.find('div', {'class': 'c-preview-pic'}) is not None:
        # NOTE(review): the original looked up the preview <img> src here but
        # never used it; presumably the image was meant to be embedded in the
        # summary -- confirm. The leading space is kept as-is.
        description = f" {description}"
    if description.strip():
        # NOTE(review): the text is emitted as type='html' without HTML
        # escaping; consider html.escape() if previews may contain markup.
        fe.summary(description, type='html')


def generate(url):
    """Scrape a drive2 listing page and return it as an Atom document.

    Args:
        url: Address of the listing page. Post links found on it are resolved
            relative to this URL, and each post page is fetched once to read
            its publication date.

    Returns:
        The serialized Atom feed as returned by ``FeedGenerator.atom_str()``.

    Raises:
        requests.RequestException: If the listing page cannot be fetched or
            answers with an HTTP error status. Failures on individual post
            pages only drop that post's date.
    """
    # One session reuses the connection across the listing and post requests.
    with requests.Session() as session:
        session.headers.update(headers)

        r = session.get(url, timeout=_REQUEST_TIMEOUT)
        # Fail loudly instead of turning an error page into an empty feed.
        r.raise_for_status()
        soup = BeautifulSoup(r.content, features='lxml')

        fg = FeedGenerator()
        fg.id(url)
        # Fall back to the URL when the page has no <title>.
        fg.title(_node_text(soup.find('title')) or url)
        fg.link(href='http://www.drive2.ru', rel='alternate')
        fg.language('ru')

        for article in soup.find_all('div', {'class': 'c-block-card'}):
            _add_entry(fg, session, url, article)

    return fg.atom_str()
import threading  # needed for the cache lock below

# Generated feeds are kept for one hour. The cache is keyed by the
# caller-supplied URL, so it must be bounded or arbitrary query strings would
# grow it without limit.
_feed_cache = TTLCache(maxsize=256, ttl=3600)

# FastAPI runs plain ``def`` endpoints in a thread pool and cachetools caches
# are not thread-safe, so every cache access goes through this lock.
_feed_cache_lock = threading.Lock()


@cached(cache=_feed_cache, lock=_feed_cache_lock)
def _cached_feed(url):
    """Return the Atom document for *url*, reusing a copy cached for up to an hour.

    Exceptions raised by ``generate`` propagate and are not cached.
    """
    return generate(url)


@app.get("/")
def read_root(url: str):
    """Serve the Atom feed generated from the page at ``url``.

    Only the feed bytes are cached; a fresh ``Response`` is built per request
    so no response object is shared between concurrent requests.
    """
    # NOTE(review): ``url`` is fetched server-side as given; consider
    # restricting it to the expected hosts to avoid acting as an open proxy.
    atom = _cached_feed(url)
    return Response(content=atom, media_type='application/atom+xml; charset=utf-8')
"code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36\"}\n", + "\n", + "url = \"https://www.drive2.ru/experience/jetta/g644510077776067186?to=133933297133670330#results\"\n", + "r = requests.get(url, headers=headers)\n", + "\n", + "soup = BeautifulSoup(r.content)\n", + "\n", + "fg = FeedGenerator()\n", + "fg.id(url)\n", + "fg.title(soup.find('title').get_text(strip=True))\n", + "fg.link( href='http://www.drive2.ru', rel='alternate' )\n", + "fg.language('ru')\n", + "\n", + "for article in soup.find_all('div', {'class': 'c-block-card'}):\n", + " fe = fg.add_entry()\n", + "\n", + " author = article.find('div', {'class': 'c-car-card__owner'})\n", + " car = article.find('div', {'class': 'c-car-card__caption'})\n", + " final_author = f\"{author.get_text(strip=True)} ({car.get_text(strip=True)})\"\n", + "\n", + " fe.author({'name': final_author})\n", + "\n", + "\n", + " post_title = article.find('a', {'data-ym-target': 'post_title'})\n", + " fe.title(post_title.get_text(strip=True))\n", + " post_url = post_title['href']\n", + " id = [p for p in post_url.split('/') if p.strip()][-1]\n", + " fe.id(id)\n", + " full_url = urllib.parse.urljoin(url, post_url)\n", + " fe.link({'href': full_url})\n", + "\n", + " article_r = requests.get(full_url, headers=headers)\n", + " article_soup = BeautifulSoup(article_r.content)\n", + " header = article_soup.find('header', {'class': 'x-title-header'})\n", + " date_div = header.find('div', {'class': 'x-secondary-color'})\n", + " date_txt = date_div.get_text(strip=True).lower()\n", + " date = parse_date(date_txt)\n", + " fe.pubDate(date)\n", + "\n", + " desc = article.find('div', {'class': 'c-post-preview__lead'})\n", + " fe.description(desc.get_text(strip=True))\n", + "\n", + " preview_uri_div = article.find('div', {'class': 'c-preview-pic'})\n", + " if 
(preview_uri_div):\n", + " preview_uri = preview_uri_div.find('img')['src']\n", + " fe.enclosure(preview_uri)\n", + "\n", + "fg.atom_file('result.xml')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/services/drive2/docker-compose.yaml b/services/drive2/docker-compose.yaml new file mode 100644 index 0000000..c4a315a --- /dev/null +++ b/services/drive2/docker-compose.yaml @@ -0,0 +1,9 @@ +services: + feed-server: + image: feed-server + build: + context: . + ports: + - 8890:80 + +# http://localhost:8890/?url=https://www.drive2.ru/experience/jetta/g644510077776067186 \ No newline at end of file diff --git a/services/drive2/reqs.txt b/services/drive2/reqs.txt new file mode 100644 index 0000000..1b05138 --- /dev/null +++ b/services/drive2/reqs.txt @@ -0,0 +1,5 @@ +beautifulsoup4==4.13.5 +requests==2.32.5 +feedgen==1.0.0 +fastapi[standard] +cachetools==6.2.0 \ No newline at end of file