from json import loads
from os import path, remove
from tempfile import gettempdir
from requests import get
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy.item import Item, Field
from scrapy.pipelines.files import FilesPipeline
# project-local modules: thin wrappers around the Elasticsearch and MinIO
# clients plus config/credentials (note: `secrets` here is a local module
# that shadows the stdlib `secrets`)
from ElasticSearch import ElasticSearch
from MinIO import MinIO
from secrets import MINIO_ACCESS_KEY, MINIO_SECRET_KEY
from variables import ES_HOST, MINIO_SERVER
MAX_TASKS = 10  # number of pending URLs pulled from Elasticsearch per run
SLEEP_TIME = 0.5 * 60  # seconds; defined but not used in this script
class CdnSpider(Spider):
name = "cdn_spider"
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.base = "урл"
self.es = ElasticSearch(hosts=[ES_HOST])
self.temp_dir = gettempdir()
self.aws = MinIO(
endpoint=MINIO_SERVER,
access_key=MINIO_ACCESS_KEY,
secret_key=MINIO_SECRET_KEY,
)
        # pick up to MAX_TASKS records whose video_url is still the "NULL" placeholder
        self.urls = [
            obj["url"]
            for obj in self.es.search(
                index="videos",
                body={
                    "query": {"match": {"video_url": "NULL"}},
                    "size": MAX_TASKS,
                },
            )
        ]
    def start_requests(self):
        # issue one GET per pending page, spoofing a desktop Chrome user agent
        for url in self.urls:
yield Request(
url=url,
callback=self.parse,
method="GET",
headers={
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/87.0.4280.88 Safari/537.36",
"content-type": "application/x-www-form-urlencoded",
},
)
def parse(self, response, **kwargs):
        # page not found
not_found = response.css(
"div[id=wrapper_div] > "
"div[id=content_div] > "
"div.main_content > "
"span.page_message::text "
).get()
if not_found == "Post Not Found":
self.es.delete_by_query(
index="videos",
body={"query": {"match": {"url": f"{response.url}"}}},
)
return
        # video removed
content_removed = response.css(
"div[id=wrapper_div] > "
"div[id=content_div] > "
"div.main_content > "
"div.post_el_small.post_el_post > "
"div.post_el_wrap > "
"div:nth-child(4) > span::text"
).get()
if content_removed == "Sorry! Content removed.":
self.es.delete_by_query(
index="videos",
body={"query": {"match": {"url": f"{response.url}"}}},
)
return
        # the encrypted CDN path is stored as JSON in the data-vnfo attribute
        data_vnfo = (
            response.css("div[id=wrapper_div]")
            .css("div[id=content_div]")
            .css("div.main_content > span.vidsnfo::attr(data-vnfo)")
        )
        video_url = list(loads(data_vnfo.get()).values())[0]
        video_url = f"{self.base}{decrypting(video_url)}"
        yield UrlsItem(video_url=video_url, url=response.url)
class UrlsItem(Item):
    # page URL plus the resolved direct video URL, handed to VideoPipeline
    url = Field()
    video_url = Field()
class VideoPipeline(FilesPipeline):
    # process_item is overridden wholesale, so FilesPipeline's stock download
    # machinery is bypassed: the file is fetched manually and pushed to MinIO
    def process_item(self, item, spider):
        # download the video into the spider's temporary directory
        temp = download_file(url=item["video_url"], directory=spider.temp_dir)
        # upload the raw file to the MinIO bucket
        if spider.aws.upload_file(
            filename=temp,
            bucket_name="raw_videos",
            filename_in_bucket=path.basename(temp),
        ):
            raw_filepath = path.join("raw_videos", path.basename(temp)).replace(
                "\\", "/"
            )
            # write the resolved video_url and bucket path back to Elasticsearch
            if spider.es.update_by_query(
                index="videos",
                body={
                    "query": {"term": {"url": item["url"]}},
                    "script": {
                        "source": "ctx._source.video_url = params.video_url ; "
                        "ctx._source.raw_filepath = params.raw_filepath",
                        "lang": "painless",
                        "params": {
                            "video_url": item["video_url"],
                            "raw_filepath": raw_filepath,
                        },
                    },
                },
            ):
                # delete the local copy only after ES has been updated
                remove(temp)
        return item
def download_file(url: str, directory: str) -> str:
    """Stream a remote file into `directory` and return the local path."""
    local_filename = url.split("/")[-1]
    filename = path.join(directory, local_filename)
    # NOTE the stream=True parameter below keeps the whole file out of memory
    with get(url, stream=True) as r:
        r.raise_for_status()
        with open(file=filename, mode="wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    return filename
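# For illustration only (hypothetical URL, not from the original paste):
#   download_file("https://cdn.example/clip.vid", gettempdir())
# would stream the response body to <tempdir>/clip.vid and return that path.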
def decrypting(link: str):
    """
    :param link: a URL of the form '/cdn/c10/jy3p1xzk1u3e0xzs9l5yz71e5a3/06DNXKxRpMEDmZ1lqK1Iew/1610390527
    /u5tb3dh8k0pb8ek68d21h4g0pdt/a5ffzfeb84yfle48iambcaaf81g.vid'
    :return: /cdn8/c10/lt3j1uz21t3q09zr9f5ez41a5g3/hIMNrXNQPRe5S1L1bW7ciw/1610390562/
    js5ebad38l09bree66dx1r4f0fd/p65ifkf5bx4gfbeh87a0biaofx1.vid
    """
    buffer = link.replace("/cdn/", "/cdn8/").split("/")
    bbuffer = buffer  # alias of the same list, not a copy
    c = int(buffer[5])  # numeric segment (a timestamp in the example above)
    a = stripdown(buffer[6])  # digits of the token segment
    b = stripdown(buffer[7])  # digits of the file name
    a = quersumm(a)
    b = quersumm(b)
    c = c - (a + b)  # shift the numeric segment by the combined digit sums
    bbuffer[5] = str(c)
    return "/".join(bbuffer)
def stripdown(string_: str) -> str:
    """
    :param string_: input string
    :return: only the digit characters
    """
    return "".join(c for c in string_ if c.isdigit())
def quersumm(string_: str) -> int:
    """
    :param string_: input string of digits
    :return: sum of the digits
    """
    return sum(int(i) for i in string_)
if __name__ == "__main__":
    process = CrawlerProcess(
        settings={
            "FILES_STORE": gettempdir(),
            "DOWNLOAD_DELAY": 5,
            "CONCURRENT_REQUESTS": 10,
            "CONCURRENT_ITEMS": 10,
            "AUTOTHROTTLE_ENABLED": True,
            "AUTOTHROTTLE_START_DELAY": 5,
            "ITEM_PIPELINES": {VideoPipeline: 200},
        }
    )
    process.crawl(CdnSpider, name="cdn_parser")
    process.start()  # blocks until the crawl finishes
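# Rough usage sketch (the file name is hypothetical; assumes the local
# ElasticSearch/MinIO wrappers and the "videos" index already exist):
#   $ python cdn_spider.py
# Each run resolves up to MAX_TASKS pending URLs, uploads the downloaded files
# to the "raw_videos" bucket and writes the resolved paths back to Elasticsearch.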