from json import loads
from os import path, remove
from tempfile import gettempdir
from requests import get
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy.item import Item, Field
from scrapy.pipelines.files import FilesPipeline
# project-local modules: thin wrappers around the Elasticsearch and MinIO
# clients plus config/credentials (note: `secrets` here is a local module
# that shadows the stdlib `secrets`)
from ElasticSearch import ElasticSearch
from MinIO import MinIO
from secrets import MINIO_ACCESS_KEY, MINIO_SECRET_KEY
from variables import ES_HOST, MINIO_SERVER
MAX_TASKS = 10  # number of pending URLs pulled from Elasticsearch per run
SLEEP_TIME = 0.5 * 60  # seconds; defined but not used in this script
class CdnSpider(Spider):
name = "cdn_spider"
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.base = "урл"
self.es = ElasticSearch(hosts=[ES_HOST])
self.temp_dir = gettempdir()
self.aws = MinIO(
endpoint=MINIO_SERVER,
access_key=MINIO_ACCESS_KEY,
secret_key=MINIO_SECRET_KEY,
)
        # pick up to MAX_TASKS records whose video_url is still the "NULL" placeholder
        self.urls = [
            obj["url"]
            for obj in self.es.search(
                index="videos",
                body={
                    "query": {"match": {"video_url": "NULL"}},
                    "size": MAX_TASKS,
                },
            )
        ]
    def start_requests(self):
        # issue one GET per pending page, spoofing a desktop Chrome user agent
        for url in self.urls:
yield Request(
url=url,
callback=self.parse,
method="GET",
headers={
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/87.0.4280.88 Safari/537.36",
"content-type": "application/x-www-form-urlencoded",
},
)
def parse(self, response, **kwargs):
        # page not found
not_found = response.css(
"div[id=wrapper_div] > "
"div[id=content_div] > "
"div.main_content > "
"span.page_message::text "
).get()
if not_found == "Post Not Found":
self.es.delete_by_query(
index="videos",
body={"query": {"match": {"url": f"{response.url}"}}},
)
return
        # video removed
content_removed = response.css(
"div[id=wrapper_div] > "
"div[id=content_div] > "
"div.main_content > "
"div.post_el_small.post_el_post > "
"div.post_el_wrap > "
"div:nth-child(4) > span::text"
).get()
if content_removed == "Sorry! Content removed.":
self.es.delete_by_query(
index="videos",
body={"query": {"match": {"url": f"{response.url}"}}},
)
return
        # the encrypted CDN path is stored as JSON in the data-vnfo attribute
        data_vnfo = (
            response.css("div[id=wrapper_div]")
            .css("div[id=content_div]")
            .css("div.main_content > span.vidsnfo::attr(data-vnfo)")
        )
        video_url = list(loads(data_vnfo.get()).values())[0]
        video_url = f"{self.base}{decrypting(video_url)}"
        yield UrlsItem(video_url=video_url, url=response.url)
class UrlsItem(Item):
    # page URL plus the resolved direct video URL, handed to VideoPipeline
    url = Field()
    video_url = Field()
class VideoPipeline(FilesPipeline):
    # process_item is overridden wholesale, so FilesPipeline's stock download
    # machinery is bypassed: the file is fetched manually and pushed to MinIO
    def process_item(self, item, spider):
        # download the video into the spider's temporary directory
        temp = download_file(url=item["video_url"], directory=spider.temp_dir)
        # upload the raw file to the MinIO bucket
        if spider.aws.upload_file(
            filename=temp,
            bucket_name="raw_videos",
            filename_in_bucket=path.basename(temp),
        ):
            raw_filepath = path.join("raw_videos", path.basename(temp)).replace(
                "\\", "/"
            )
            # write the resolved video_url and bucket path back to Elasticsearch
            if spider.es.update_by_query(
                index="videos",
                body={
                    "query": {"term": {"url": item["url"]}},
                    "script": {
                        "source": "ctx._source.video_url = params.video_url ; "
                        "ctx._source.raw_filepath = params.raw_filepath",
                        "lang": "painless",
                        "params": {
                            "video_url": item["video_url"],
                            "raw_filepath": raw_filepath,
                        },
                    },
                },
            ):
                # delete the local copy only after ES has been updated
                remove(temp)
        return item
def download_file(url: str, directory: str) -> str:
    """Stream a remote file into `directory` and return the local path."""
    local_filename = url.split("/")[-1]
    filename = path.join(directory, local_filename)
    # NOTE the stream=True parameter below keeps the whole file out of memory
    with get(url, stream=True) as r:
        r.raise_for_status()
        with open(file=filename, mode="wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    return filename
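# For illustration only (hypothetical URL, not from the original paste):
#   download_file("https://cdn.example/clip.vid", gettempdir())
# would stream the response body to <tempdir>/clip.vid and return that path.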
def decrypting(link: str):
    """
    :param link: a URL of the form '/cdn/c10/jy3p1xzk1u3e0xzs9l5yz71e5a3/06DNXKxRpMEDmZ1lqK1Iew/1610390527
    /u5tb3dh8k0pb8ek68d21h4g0pdt/a5ffzfeb84yfle48iambcaaf81g.vid'
    :return: /cdn8/c10/lt3j1uz21t3q09zr9f5ez41a5g3/hIMNrXNQPRe5S1L1bW7ciw/1610390562/
    js5ebad38l09bree66dx1r4f0fd/p65ifkf5bx4gfbeh87a0biaofx1.vid
    """
    buffer = link.replace("/cdn/", "/cdn8/").split("/")
    bbuffer = buffer  # alias of the same list, not a copy
    c = int(buffer[5])  # numeric segment (a timestamp in the example above)
    a = stripdown(buffer[6])  # digits of the token segment
    b = stripdown(buffer[7])  # digits of the file name
    a = quersumm(a)
    b = quersumm(b)
    c = c - (a + b)  # shift the numeric segment by the combined digit sums
    bbuffer[5] = str(c)
    return "/".join(bbuffer)
def stripdown(string_: str) -> str:
    """
    :param string_: input string
    :return: only the digit characters
    """
    return "".join(c for c in string_ if c.isdigit())
def quersumm(string_: str) -> int:
    """
    :param string_: input string of digits
    :return: sum of the digits
    """
    return sum(int(i) for i in string_)
if __name__ == "__main__":
    process = CrawlerProcess(
        settings={
            "FILES_STORE": gettempdir(),
            "DOWNLOAD_DELAY": 5,
            "CONCURRENT_REQUESTS": 10,
            "CONCURRENT_ITEMS": 10,
            "AUTOTHROTTLE_ENABLED": True,
            "AUTOTHROTTLE_START_DELAY": 5,
            "ITEM_PIPELINES": {VideoPipeline: 200},
        }
    )
    process.crawl(CdnSpider, name="cdn_parser")
    process.start()  # blocks until the crawl finishes
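# Rough usage sketch (the file name is hypothetical; assumes the local
# ElasticSearch/MinIO wrappers and the "videos" index already exist):
#   $ python cdn_spider.py
# Each run resolves up to MAX_TASKS pending URLs, uploads the downloaded files
# to the "raw_videos" bucket and writes the resolved paths back to Elasticsearch.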