rainclassroom-scrapper/video_processing.py at 969cdb10e869711f4eee473f101127aefaec3bc4

Fork: 0
physicsdolphin / rainclassroom-scrapper
Find file
Newer
Older
rainclassroom-scrapper / video_processing.py
physicsdolphin on 14 Jan 2025 14 KB reverted double scan, add hw-encode flag and typo fix
Raw Blame History
import os
import re
import subprocess
import sys
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
import shutil

# True when running on Windows; the helpers below then launch external tools through PowerShell.
WINDOWS = sys.platform == 'win32'
# External tools: use the bare command name when it resolves on PATH, otherwise
# expect the binary to sit in the current working directory.
ARIA2C_PATH = os.path.join(os.getcwd(), "aria2c") if not shutil.which("aria2c") else "aria2c"
FFMPEG_PATH = os.path.join(os.getcwd(), "ffmpeg") if not shutil.which("ffmpeg") else "ffmpeg"


def download_segment(CACHE_FOLDER, url: str, order: int, name_prefix: str = "") -> int:
    """Download one video segment with aria2c and return the downloader's exit code.

    The segment is written to ``{CACHE_FOLDER}/{name_prefix}-{order}.mp4``. aria2c is
    started with 16 connections/splits, a 1M minimum split size, resume enabled (``-c``)
    and warnings logged to ``aria2c_video.log``.

    Args:
        CACHE_FOLDER: Directory that receives the downloaded segment.
        url: Direct URL of the segment.
        order: Index of the segment; becomes part of the file name.
        name_prefix: File-name prefix shared by all segments of one video.

    Returns:
        The exit code of the launched process (0 means success).
    """
    import shlex  # local import: only needed to quote arguments for the POSIX shell

    def quote(value: str) -> str:
        # The command is interpreted by PowerShell on Windows and by the default
        # shell elsewhere, so the value has to be escaped for the right one.
        if WINDOWS:
            # Inside a PowerShell single-quoted string a quote is escaped by doubling it.
            return "'" + value.replace("'", "''") + "'"
        return shlex.quote(value)

    print(f"Downloading {name_prefix} - {order}")

    target = f"{CACHE_FOLDER}/{name_prefix}-{order}.mp4"
    video_download_command = (
        f"{ARIA2C_PATH} -o {quote(target)} -x 16 -s 16 -k 1M {quote(url)} "
        f"--stream-piece-selector random -c -l aria2c_video.log --log-level warn"
    )

    if WINDOWS:
        result = subprocess.run(['powershell', '-Command', video_download_command], text=True)
    else:
        result = subprocess.run(video_download_command, shell=True)

    return result.returncode


def download_segment_idm(CACHE_FOLDER, url: str, order: int, name_prefix: str = "") -> int:
    """Hand one segment to Internet Download Manager and wait until the file is complete.

    The target is ``{name_prefix}-{order}.mp4`` inside ``CACHE_FOLDER`` (forward slashes
    in ``name_prefix`` are turned into backslashes first). If that file already exists
    nothing is downloaded. Otherwise ``idman`` is launched through PowerShell and the
    function polls once per second until the file exists and its size is the same on
    two consecutive polls.

    Note: there is no timeout; if the file never appears the function waits forever.

    Args:
        CACHE_FOLDER: Directory that receives the downloaded segment.
        url: Direct URL of the segment.
        order: Index of the segment; becomes part of the file name.
        name_prefix: File-name prefix shared by all segments of one video.

    Returns:
        0 both when the file was already present and when the download finished,
        so callers can treat the value like a downloader exit code.

    Raises:
        KeyboardInterrupt: Re-raised after removing a partially downloaded file.
    """
    print(f"Downloading {name_prefix} - {order}")

    name_prefix_idm = name_prefix.replace('/', '\\')

    # Full path of the file IDM is asked to produce
    downloaded_file = os.path.join(CACHE_FOLDER, f"{name_prefix_idm}-{order}.mp4")
    print("Downloading to", downloaded_file)

    if os.path.exists(downloaded_file):
        print(f"Skipping {downloaded_file} - Video already present")
        time.sleep(0.25)
        # Same value as a finished download, so the return type does not depend on the path taken
        return 0

    # Construct the IDM download command
    video_download_command = (
        f"idman /n /d \"{url}\" /p \"$(pwd)\" /f \"{downloaded_file}\""
    )

    # Start the IDM download; completion is detected by watching the file below
    subprocess.run(['powershell', '-Command', video_download_command], text=True)

    print(f"Waiting for {downloaded_file} to finish downloading...")
    try:
        # Wait until the file is created
        while not os.path.exists(downloaded_file):
            time.sleep(1)

        # Wait until the file stops growing
        prev_size = -1
        while True:
            curr_size = os.path.getsize(downloaded_file)
            if curr_size == prev_size:
                break  # File size hasn't changed, download likely complete
            prev_size = curr_size
            time.sleep(1)  # Check every second

    except KeyboardInterrupt:
        print("\nDownload interrupted by user.")
        # Clean up the partially downloaded file before propagating the interrupt
        if os.path.exists(downloaded_file):
            print(f"Removing incomplete file: {downloaded_file}")
            os.remove(downloaded_file)
        raise

    print(f"Download completed: {downloaded_file}")
    return 0


def download_segment_m3u8(idm_flag, CACHE_FOLDER, url: str, order: int, name_prefix: str = "",
                          max_retries: int = 35) -> int:
    """Download one streamed (m3u8) segment and return the downloader's exit code.

    The output is ``{CACHE_FOLDER}/{name_prefix}-{order}.mp4``. The tool is chosen as follows:

    * URL contains ``mp3`` or the platform is not Windows: ``idman`` when ``idm_flag``
      is set, otherwise ffmpeg with stream copy.
    * Otherwise (Windows): ``.\\N_m3u8DL-RE.exe`` with 64 threads and 15 retries.

    Args:
        idm_flag: Use Internet Download Manager instead of ffmpeg in the first case above.
        CACHE_FOLDER: Directory that receives the downloaded segment.
        url: URL of the playlist / media.
        order: Index of the segment; becomes part of the file name.
        name_prefix: File-name prefix shared by all segments of one video.
        max_retries: Currently unused; kept so existing callers keep working.

    Returns:
        The exit code of the launched process (0 means success).
    """
    import shlex  # local import: only needed to quote arguments for the POSIX shell

    def quote(value: str) -> str:
        # The command is interpreted by PowerShell on Windows and by the default
        # shell elsewhere, so the value has to be escaped for the right one.
        if WINDOWS:
            # Inside a PowerShell single-quoted string a quote is escaped by doubling it.
            return "'" + value.replace("'", "''") + "'"
        return shlex.quote(value)

    print(f"Downloading {name_prefix} - {order}")
    print(f"Downloading from {url}")

    output_path = f"{CACHE_FOLDER}/{name_prefix}-{order}"

    if 'mp3' in url or not WINDOWS:
        if idm_flag:
            # IDM command line left exactly as before
            video_download_command = (
                f"idman /n /d \"{url}\" /p \"$(pwd)\" /f '{CACHE_FOLDER}/{name_prefix}-{order}.mp4'"
            )
        else:
            video_download_command = (
                f"{FFMPEG_PATH} -i {quote(url)} -c:v copy -c:a copy -n {quote(output_path + '.mp4')} "
                f"-hide_banner -loglevel error -stats"
            )
    else:
        # N_m3u8DL-RE takes the directory and the base name (without extension) separately
        save_dir = os.path.dirname(output_path)
        save_name = os.path.basename(output_path)
        video_download_command = (
            f".\\N_m3u8DL-RE.exe {quote(url)} --tmp-dir {quote(f'./{CACHE_FOLDER}')} "
            f"--save-dir {quote(save_dir)} --save-name {quote(save_name)} -M format=mp4 "
            f"--check-segments-count false --download-retry-count 15 --thread-count 64"
        )

    if WINDOWS:
        result = subprocess.run(['powershell', '-Command', video_download_command], text=True)
    else:
        result = subprocess.run(video_download_command, shell=True)

    return result.returncode


def download_segments_in_parallel(idm_flag, fallback_flag, CACHE_FOLDER, lesson_video_data, name_prefix):
    """Download every segment of one lesson video, up to six at a time.

    The payload layout depends on ``fallback_flag``:

    * ``2`` (MOOC): ``lesson_video_data`` is an iterable of URLs; the m3u8 downloader is never used.
    * ``1`` (v1): URLs are ``segment['replay_url']`` of ``lesson_video_data['data']['live_timeline']``.
    * anything else (v3): URLs are ``segment['url']`` of ``lesson_video_data['data']['live']``.

    For v1/v3 a URL containing ``m3u8`` goes to ``download_segment_m3u8``; every other
    URL goes to ``download_segment_idm`` when ``idm_flag`` is set, else ``download_segment``.

    Args:
        idm_flag: Prefer Internet Download Manager for direct downloads. When set,
            returned values are not inspected; only raised exceptions count as failures.
        fallback_flag: Selects the payload layout described above.
        CACHE_FOLDER: Directory that receives the downloaded segments.
        lesson_video_data: Segment description in the layout selected by ``fallback_flag``.
        name_prefix: File-name prefix shared by all segments of the video.

    Raises:
        Exception: If at least one segment failed to download.
    """
    urls = _segment_urls(fallback_flag, lesson_video_data)
    # MOOC payloads always use the direct downloaders, even if a URL mentions m3u8
    m3u8_allowed = fallback_flag != 2
    has_error = False

    with ThreadPoolExecutor(max_workers=6) as executor:
        # Maps each future back to the segment index for reporting
        future_to_order = {}

        for order, url in enumerate(urls):
            if m3u8_allowed and 'm3u8' in url:
                future = executor.submit(download_segment_m3u8, idm_flag, CACHE_FOLDER, url, order,
                                         name_prefix, max_retries=10)
            elif idm_flag:
                future = executor.submit(download_segment_idm, CACHE_FOLDER, url, order, name_prefix)
            else:
                future = executor.submit(download_segment, CACHE_FOLDER, url, order, name_prefix)

            future_to_order[future] = order

            # Add a 1-second interval between submissions
            time.sleep(1)

        for future in as_completed(future_to_order):
            order = future_to_order[future]
            try:
                result = future.result()  # Re-raises any exception from the worker
                if not idm_flag and result != 0:
                    print(f"Failed to download {name_prefix} - {order}, downloader returned {result}", file=sys.stderr)
                    has_error = True
                else:
                    print(f"Successfully downloaded {name_prefix} - {order}")
            except Exception:
                print(traceback.format_exc())
                print(f"Failed to download {name_prefix} - {order}", file=sys.stderr)
                has_error = True

    if has_error:
        raise Exception("Failed to download some video segments.")


def _segment_urls(fallback_flag, lesson_video_data):
    """Return the ordered list of segment URLs for the payload layout named by ``fallback_flag``."""
    if fallback_flag == 2:
        return list(lesson_video_data)
    if fallback_flag == 1:
        return [segment['replay_url'] for segment in lesson_video_data['data']['live_timeline']]
    return [segment['url'] for segment in lesson_video_data['data']['live']]


def concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix, num_segments, hw_decoding_flag):
    """Join the cached segments of one video into ``{DOWNLOAD_FOLDER}/{name_prefix}.mp4``.

    A concat list (``{CACHE_FOLDER}/concat.txt``) is written first, naming every existing
    ``{name_prefix}-{i}.mp4`` / ``{name_prefix}-{i}.ts`` for ``i`` in ``range(num_segments)``.
    Entries are relative to the list file, which is how the ffmpeg concat demuxer resolves
    relative paths, so any ``CACHE_FOLDER`` (nested or absolute) works.

    If the target already exists, encoding is skipped. Otherwise ffmpeg re-encodes with
    ``av1_nvenc`` video and AAC audio; if that fails, a second attempt copies the audio,
    overwrites the output and tolerates corrupt input.

    Args:
        CACHE_FOLDER: Directory holding the downloaded segments.
        DOWNLOAD_FOLDER: Directory that receives the final video.
        name_prefix: File-name prefix of the segments and base name of the output.
        num_segments: Number of segment indices to look for.
        hw_decoding_flag: Add ``-hwaccel cuda`` to the first attempt.

    Returns:
        The target path (``str``) when the video was already present; otherwise the
        ``subprocess.CompletedProcess`` of the last ffmpeg attempt that was run.
    """
    import shlex  # local import: only needed to quote arguments for the POSIX shell

    def quote(value: str) -> str:
        # The command is interpreted by PowerShell on Windows and by the default
        # shell elsewhere, so the value has to be escaped for the right one.
        if WINDOWS:
            # Inside a PowerShell single-quoted string a quote is escaped by doubling it.
            return "'" + value.replace("'", "''") + "'"
        return shlex.quote(value)

    def run(command: str):
        if WINDOWS:
            return subprocess.run(['powershell', '-Command', command], text=True)
        return subprocess.run(command, shell=True)

    # Create the concat file with segment paths
    concat_list = f"{CACHE_FOLDER}/concat.txt"
    with open(concat_list, "w", encoding='utf-8') as f:
        for i in range(num_segments):
            for extension in ("mp4", "ts"):
                segment_name = f"{name_prefix}-{i}.{extension}"
                if os.path.exists(os.path.join(CACHE_FOLDER, segment_name)):
                    # ffmpeg list syntax: a quote inside '...' is written as '\''
                    escaped = segment_name.replace("'", "'\\''")
                    f.write(f"file '{escaped}'\n")

    target_file = os.path.join(DOWNLOAD_FOLDER, f"{name_prefix}.mp4")
    if os.path.exists(target_file):
        print(f"Skipping '{DOWNLOAD_FOLDER}/{name_prefix}.mp4' - Video already present")
        time.sleep(0.25)
        return target_file

    source = quote(concat_list)
    output = quote(f"{DOWNLOAD_FOLDER}/{name_prefix}.mp4")
    # Video encoder settings shared by both attempts
    video_options = (
        "-c:v av1_nvenc -cq 36 -g 200 -bf 7 -b_strategy 1 -sc_threshold 80 -me_range 16 "
        "-surfaces 64 -bufsize 12800k -refs 16 -r 7.5 -temporal-aq 1 -rc-lookahead 127"
    )
    decode_options = "-hwaccel cuda " if hw_decoding_flag else ""

    # First attempt: never overwrite (-n), re-encode audio to mono AAC
    video_concatenating_command = (
        f"{FFMPEG_PATH} {decode_options}-f concat -safe 0 -i {source} {video_options} "
        f"-c:a aac -ac 1 -rematrix_maxval 1.0 -b:a 64k {output} -n "
        f"-hide_banner -loglevel error -stats"
    )
    result = run(video_concatenating_command)

    if result.returncode == 0:
        print("Successfully concatenated video segments using CUDA acceleration.")
        return result

    print("First attempt failed. Attempting fallback with software decoding.")

    # Fallback: no hardware decoding, audio copied, output overwritten (-y).
    # The error-tolerance flags are input options and must precede -i; after the
    # output file ffmpeg treats them as trailing options and ignores them.
    video_concatenating_command_fallback = (
        f"{FFMPEG_PATH} -err_detect ignore_err -fflags +discardcorrupt -f concat -safe 0 -i {source} "
        f"{video_options} -c:a copy {output} -y "
        f"-hide_banner -loglevel error -stats"
    )
    fallback_result = run(video_concatenating_command_fallback)

    if fallback_result.returncode != 0:
        print("Both attempts failed to concatenate video segments.")
    else:
        print("Successfully concatenated video segments.")

    # Report the attempt that decided the outcome, not the failed first one
    return fallback_result