server/net: improve youtube-dl functionality, enforce size limits

This commit is contained in:
Shyam Sunder 2021-01-05 15:18:34 -05:00
parent 2dfd1c2192
commit c7461c7f65
4 changed files with 54 additions and 65 deletions

View file

@ -21,7 +21,7 @@ RUN apk --no-cache add \
&& pip3 install --no-cache-dir --disable-pip-version-check \
alembic \
"coloredlogs==5.0" \
youtube-dl \
youtube_dl \
&& apk --no-cache del py3-pip
COPY ./ /opt/app/

View file

@ -9,4 +9,4 @@ pillow>=4.3.0
pynacl>=1.2.1
pytz>=2018.3
pyRFC3339>=1.0
youtube-dl
youtube_dl

View file

@ -1,76 +1,75 @@
import json
import logging
import os
import subprocess
import urllib.error
import urllib.request
from tempfile import NamedTemporaryFile
from threading import Thread
from typing import Any, Dict, List
from youtube_dl import YoutubeDL
from youtube_dl.utils import YoutubeDLError
from szurubooru import config, errors
from szurubooru.func import mime, util
logger = logging.getLogger(__name__)
_dl_chunk_size = 2 ** 15
class DownloadError(errors.ProcessingError):
    """Raised by ``download`` when reading the remote content fails."""
class DownloadTooLargeError(DownloadError):
    """Raised by ``download`` when the content exceeds ``max_dl_filesize``."""
def download(url: str, use_video_downloader: bool = False) -> bytes:
    """Fetch the content at ``url`` and return it as bytes.

    The response is read in pieces of ``_dl_chunk_size`` bytes so that the
    transfer can be aborted as soon as it grows past
    ``config.config["max_dl_filesize"]`` instead of buffering an arbitrarily
    large body first.

    :param url: address to fetch; must be non-empty.
    :param use_video_downloader: when true, ``url`` is first resolved to a
        direct content location via ``_get_youtube_dl_content_url``.
    :return: the complete response body.
    :raises DownloadError: if reading from the open connection fails.
    :raises DownloadTooLargeError: if more than ``max_dl_filesize`` bytes
        are received.
    """
    assert url
    if use_video_downloader:
        url = _get_youtube_dl_content_url(url)

    request = urllib.request.Request(url)
    if config.config["user_agent"]:
        request.add_header("User-Agent", config.config["user_agent"])
    request.add_header("Referer", url)

    max_size = config.config["max_dl_filesize"]
    # Chunks are collected in a list and joined once at the end; repeatedly
    # concatenating ``bytes`` objects would copy the whole buffer each time.
    chunks = []
    length_tally = 0
    # NOTE(review): errors raised by urlopen() itself (bad URL, HTTP error,
    # unreachable host) propagate unchanged from here -- confirm whether they
    # should be reported as DownloadError as well.
    with urllib.request.urlopen(request) as handle:
        while True:
            try:
                chunk = handle.read(_dl_chunk_size)
            except Exception:
                raise DownloadError(url) from None
            if not chunk:
                break
            length_tally += len(chunk)
            if length_tally > max_size:
                raise DownloadTooLargeError(url)
            chunks.append(chunk)
    return b"".join(chunks)
def _get_youtube_dl_content_url(url: str) -> str:
    """Resolve ``url`` to a content location using the ``youtube-dl`` CLI.

    Runs ``youtube-dl --format best --get-url`` (adding ``--user-agent`` when
    ``config.config["user_agent"]`` is set) and returns the first line of
    its standard output, stripped of surrounding whitespace.

    :param url: page or media address handed to youtube-dl.
    :return: the first location printed by youtube-dl.
    :raises errors.ThirdPartyError: if youtube-dl exits with a non-zero
        status or prints no location.
    """
    cmd = ["youtube-dl", "--format", "best"]
    if config.config["user_agent"]:
        cmd.extend(["--user-agent", config.config["user_agent"]])
    # "--" ends option parsing, so a URL that starts with "-" is treated as
    # a positional argument rather than as a youtube-dl option.
    cmd.extend(["--get-url", "--", url])
    try:
        result = subprocess.run(
            cmd, text=True, capture_output=True, check=True
        )
    except subprocess.CalledProcessError:
        raise errors.ThirdPartyError(
            "Could not extract content location from %s" % (url)
        ) from None
    content_url = result.stdout.split("\n")[0].strip()
    if not content_url:
        # A successful exit without any output gives nothing to download.
        raise errors.ThirdPartyError(
            "Could not extract content location from %s" % (url)
        )
    return content_url
def post_to_webhooks(payload: Dict[str, Any]) -> List[Thread]:
    """Start one thread per configured webhook and return the threads.

    Each thread runs ``_post_to_webhook(webhook, payload)``. A missing or
    empty ``config.config["webhooks"]`` value results in an empty list.

    :param payload: data passed unchanged to every ``_post_to_webhook`` call.
    :return: the already-started threads, one per configured webhook.
    """
    threads = [
        # daemon=False is passed explicitly so the threads are non-daemon
        # regardless of the daemon status of the calling thread.
        Thread(target=_post_to_webhook, args=(webhook, payload), daemon=False)
        for webhook in (config.config["webhooks"] or [])
    ]
    for thread in threads:
        thread.start()
    return threads

View file

@ -1,6 +1,3 @@
from datetime import datetime
from unittest.mock import patch
import pytest
from szurubooru import errors
@ -69,41 +66,34 @@ def test_download():
"url",
[
"https://samples.ffmpeg.org/MPEG-4/video.mp4",
"https://www.youtube.com/watch?v=dQw4w9WgXcQ",
],
)
def test_too_large_download(url):
    """Downloading an oversized resource raises ``DownloadTooLargeError``."""
    with pytest.raises(net.DownloadTooLargeError):
        net.download(url, use_video_downloader=True)
@pytest.mark.parametrize(
    "url,expected_sha1",
    [
        (
            "https://gfycat.com/immaterialchillyiberianmole",
            "0125976d2439e651b6863438db30de58f79f7754",
        ),
        (
            "https://upload.wikimedia.org/wikipedia/commons/a/ad/Utah_teapot.png",  # noqa: E501
            "cfadcbdeda1204dc1363ee5c1969191f26be2e41",
        ),
    ],
)
def test_content_download(url, expected_sha1):
    """Content fetched through the video downloader matches a known SHA-1."""
    actual_content = net.download(url, use_video_downloader=True)
    assert get_sha1(actual_content) == expected_sha1
def test_bad_content_download():
    """A page without extractable content raises ``ThirdPartyError``."""
    url = "http://info.cern.ch/hypertext/WWW/TheProject.html"
    with pytest.raises(errors.ThirdPartyError):
        net.download(url, use_video_downloader=True)