diff --git a/.gitignore b/.gitignore index 83d2690..30b37b0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ .idea/ /data/ /cache/ +*.log +__pycache__ +ffmpeg.exe diff --git a/N_m3u8DL-RE.exe b/N_m3u8DL-RE.exe new file mode 100644 index 0000000..b23b9e8 --- /dev/null +++ b/N_m3u8DL-RE.exe Binary files differ diff --git a/README.md b/README.md new file mode 100644 index 0000000..9f9ca91 --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +# rainclassroom-scrapper + +environment: +`conda env create -f conda_environment.yml` + +requirements: +- Python >= 3.12 +- requests +- websocket-client (qrcode login) +- qrcode (qrcode login) +- Pillow (Add answer to problem; Convert PPT to PDF) + +required system binaries: +- aria2c (Download files multi-threaded & resume support) +- ffmpeg with nvenc support (Concatenate video segments and convert to HEVC) + +usage: `main_windows.py [-h] [-c SESSION_COOKIE] [-y YKT_HOST] [--video] [--ppt] [--ppt-to-pdf] [--ppt-problem-answer] + [--course-name-filter COURSE_NAME_FILTER] [--lesson-name-filter LESSON_NAME_FILTER]` + +options: +``` +-h, --help Show this help message and exit +-c SESSION_COOKIE, --session-cookie SESSION_COOKIE + Session Cookie +-y YKT_HOST, --ykt-host YKT_HOST + RainClassroom Host +--video Download Video +--ppt Download PPT +--ppt-to-pdf Convert PPT to PDF +--ppt-problem-answer Store PPT Problem Answer +--course-name-filter COURSE_NAME_FILTER + Filter Course Name +--lesson-name-filter LESSON_NAME_FILTER + Filter Lesson Name +``` diff --git a/aria2c.exe b/aria2c.exe new file mode 100644 index 0000000..5004e10 --- /dev/null +++ b/aria2c.exe Binary files differ diff --git a/conda_environment.yml b/conda_environment.yml new file mode 100644 index 0000000..6413382 --- /dev/null +++ b/conda_environment.yml @@ -0,0 +1,61 @@ +name: rainclassroom +channels: + - defaults + - conda-forge +dependencies: + - altgraph=0.17.3=py312haa95532_0 + - brotli-python=1.0.9=py312hd77b12b_8 + - bzip2=1.0.8=h2bbff1b_6 + - ca-certificates=2024.9.24=haa95532_0 + - certifi=2024.8.30=py312haa95532_0 + - cffi=1.17.1=py312h827c3e9_0 + - charset-normalizer=3.3.2=pyhd3eb1b0_0 + - colorama=0.4.6=py312haa95532_0 + - expat=2.6.3=h5da7b33_0 + - freetype=2.12.1=ha860e81_0 + - gevent=23.9.1=py312h2bbff1b_0 + - greenlet=3.0.1=py312hd77b12b_0 + - idna=3.7=py312haa95532_0 + - jpeg=9e=h827c3e9_3 + - lcms2=2.12=h83e58a3_0 + - lerc=3.0=hd77b12b_0 + - libdeflate=1.17=h2bbff1b_1 + - libffi=3.4.4=hd77b12b_1 + - libpng=1.6.39=h8cc25b3_0 + - libtiff=4.5.1=hd77b12b_0 + - libwebp-base=1.3.2=h3d04722_1 + - lz4-c=1.9.4=h2bbff1b_1 + - openjpeg=2.5.2=hae555c5_0 + - openssl=3.0.15=h827c3e9_0 + - packaging=24.1=py312haa95532_0 + - pefile=2023.2.7=py312haa95532_0 + - pillow=10.4.0=py312h827c3e9_0 + - pip=24.2=py312haa95532_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pyinstaller=6.9.0=py312h0416ee5_0 + - pyinstaller-hooks-contrib=2024.7=py312haa95532_0 + - pypng=0.20220715.0=py312haa95532_0 + - pysocks=1.7.1=py312haa95532_0 + - python=3.12.7=h14ffc60_0 + - pywin32-ctypes=0.2.2=py312haa95532_0 + - qrcode=7.4.2=py312haa95532_0 + - requests=2.32.3=py312haa95532_0 + - setuptools=75.1.0=py312haa95532_0 + - sqlite=3.45.3=h2bbff1b_0 + - tk=8.6.14=h0416ee5_0 + - typing-extensions=4.11.0=py312haa95532_0 + - typing_extensions=4.11.0=py312haa95532_0 + - tzdata=2024b=h04d1e81_0 + - urllib3=2.2.3=py312haa95532_0 + - vc=14.40=h2eaa2aa_1 + - vs2015_runtime=14.40.33807=h98bb1dd_1 + - websocket-client=1.8.0=py312haa95532_0 + - wheel=0.44.0=py312haa95532_0 + - win_inet_pton=1.1.0=py312haa95532_0 + - xz=5.4.6=h8cc25b3_1 + - zlib=1.2.13=h8cc25b3_1 + - zope=1.0=py312haa95532_1 + - zope.event=5.0=py312haa95532_0 + - zope.interface=5.4.0=py312h2bbff1b_0 + - zstd=1.5.6=h8880b57_0 +prefix: '%userprofile%\.conda\envs\rainclassroom' diff --git a/ffmpeg_extract_before_use.7z b/ffmpeg_extract_before_use.7z new file mode 100644 index 0000000..f65e1b4 --- /dev/null +++ b/ffmpeg_extract_before_use.7z Binary files differ diff --git a/main.py b/main.py index 556f2bd..f9dec6d 100644 --- a/main.py +++ b/main.py @@ -1,27 +1,43 @@ +# -*- coding: utf-8 -*- + import os -import signal import subprocess import sys - import argparse import time -from multiprocessing.pool import ThreadPool +import re +import traceback +import option +import shutil + +if sys.platform == 'win32': + os.system('chcp 65001') parser = argparse.ArgumentParser(add_help=False) parser.add_argument("-h", "--help", action="store_true", help="Show this help message and exit") parser.add_argument("-c", "--session-cookie", help="Session Cookie", required=False) parser.add_argument("-y", "--ykt-host", help="RainClassroom Host", required=False, default="pro.yuketang.cn") -parser.add_argument("--video", action="store_true", help="Download Video") -parser.add_argument("--ppt", action="store_true", help="Download PPT") -parser.add_argument("--ppt-to-pdf", action="store_true", help="Convert PPT to PDF", default=True) -parser.add_argument("--ppt-problem-answer", action="store_true", help="Store PPT Problem Answer", default=True) +parser.add_argument("-i", "--idm", action="store_true", help="Use IDMan.exe") +parser.add_argument("-ni", "--no-idm", action="store_true", help="Don't use IDMan.exe, implied when the system is not Windows") +parser.add_argument("-a", "--all", action="store_true", help="Download all content without asking") +parser.add_argument("-na", "--no-all", action="store_true", help="Ask before downloading each course") +parser.add_argument("-nv", "--no-video", action="store_true", help="Don't Download Video") +parser.add_argument("-np", "--no-ppt", action="store_true", help="Don't Download PPT") +parser.add_argument("-npc", "--no-convert-ppt-to-pdf", action="store_true", help="Don't Convert PPT to PDF") +parser.add_argument("-npa", "--no-ppt-answer", action="store_true", help="Don't Store PPT Problem Answer") parser.add_argument("--course-name-filter", action="store", help="Filter Course Name", default=None) parser.add_argument("--lesson-name-filter", action="store", help="Filter Lesson Name", default=None) args = parser.parse_args() -if args.help: +args.__setattr__('video', not args.no_video) +args.__setattr__('ppt', not args.no_ppt) +args.__setattr__('ppt_to_pdf', not args.no_convert_ppt_to_pdf) +args.__setattr__('ppt_problem_answer', not args.no_ppt_answer) + +# Check if no arguments are provided or only --help is provided +if args.help or len(sys.argv) == 1: print("""RainClassroom Video Downloader requirements: @@ -30,19 +46,89 @@ - websocket-client (qrcode login) - qrcode (qrcode login) - Pillow (Add answer to problem; Convert PPT to PDF) - -required system binaries: + - aria2c (Download files multi-threaded & resume support) - ffmpeg with nvenc support (Concatenate video segments and convert to HEVC) """) - print(parser.format_help()) + if sys.platform == 'win32': + print('\nYOU SHALL RUN THIS EXECUTABLE FROM POWERSHELL WITH ARGUMENT!!') + os.system('pause') + exit() +# Check for dependencies +try: + import requests +except ImportError: + print("requests is not installed. Please install it using 'pip install requests'", file=sys.stderr) + exit(1) + +if args.session_cookie is None: + try: + import websocket + except ImportError: + print("websocket-client is not installed. Please install it using 'pip install websocket-client'", file=sys.stderr) + exit(1) + + try: + import qrcode + except ImportError: + print("qrcode is not installed. Please install it using 'pip install qrcode'", file=sys.stderr) + exit(1) + +if args.ppt_to_pdf or args.ppt_problem_answer: + try: + import PIL + except ImportError: + print("PIL is not installed. Please install it using 'pip install pillow'", file=sys.stderr) + exit(1) + +if args.all and args.no_all: + print("'-a' and '-na' cannot be used together") +if args.idm and args.no_idm: + print("'-idm' and '-no_idm' cannot be used together") + +if args.all: + allin_flag = 1 +elif args.no_all: + allin_flag = 0 +else: + allin_flag = option.ask_for_allin() + +if sys.platform != 'win32': + print("Inferring --no-idm flag as the system is not Windows") + args.no_idm = True + +if args.idm: + idm_flag = 1 +elif args.no_idm: + idm_flag = 0 +else: + idm_flag = option.ask_for_idm() + +if idm_flag and shutil.which('IDMan.exe') is None: + print("IDMan.exe is not found. Please install IDM and add it to PATH, or specify '--no-idm' flag", file=sys.stderr) + exit(1) + +if idm_flag and sys.platform != 'win32': + print("WARNING: Are you sure that you want to use IDM on a non-Windows system?", file=sys.stderr) + +args.__setattr__("aria2c_path", "aria2c") +if shutil.which("aria2c") is None and os.path.exists("aria2c.exe"): + args.__setattr__("aria2c_path", os.path.join(os.getcwd(), "aria2c")) + print(f"aria2c is not found in PATH, using local binary at {args.aria2c_path}") + +if not idm_flag: + if shutil.which(args.aria2c_path) is None: + print("aria2c is not found. Please install aria2 and add it to PATH, or use IDM instead", file=sys.stderr) + exit(1) + + print("IDM is not enabled, aria2c will be used for downloading") + import requests import json -import tempfile # --- --- --- Section Init --- --- --- # # Login to RainClassroom @@ -56,9 +142,6 @@ os.makedirs(DOWNLOAD_FOLDER, exist_ok=True) os.makedirs(CACHE_FOLDER, exist_ok=True) -pool = ThreadPool(4) -interrupted = False - # --- --- --- Section Load Session --- --- --- # if args.session_cookie is not None: @@ -69,6 +152,7 @@ import websocket import qrcode + def on_message(ws, message): global userinfo userinfo = json.loads(message) @@ -91,6 +175,7 @@ def on_open(ws): ws.send(data=json.dumps({"op": "requestlogin", "role": "web", "version": 1.4, "type": "qrcode", "from": "web"})) + # websocket数据交互 ws = websocket.WebSocketApp(f"wss://{YKT_HOST}/wsapp/", on_message=on_message, @@ -104,9 +189,8 @@ data=json.dumps({'UserID': userinfo['UserID'], 'Auth': userinfo['Auth']})) # Store session - with open(f"{DOWNLOAD_FOLDER}/session.txt", "w") as f: - f.write(rainclassroom_sess.cookies['sessionid']) - + with open(f"{DOWNLOAD_FOLDER}/session.txt", "a", encoding='utf-8') as f: + f.write(rainclassroom_sess.cookies['sessionid'] + "\n") # --- --- --- Section Get Course List --- --- --- # @@ -127,44 +211,20 @@ # --- --- --- Section Get Lesson List --- --- --- # -# { -# "university_name": "", -# "term": 202401, -# "university_logo_pic": "", -# "name": "NAME", -# "type_count": [], -# "students_count": 7, -# "color_system": 3, -# "course": { -# "update_time": "", -# "name": "", -# "admin_id": 0, -# "university_id": 0, -# "type": 0, -# "id": 0 -# }, -# "teacher": { -# "user_id": 0, -# "name": "", -# "avatar": "" -# }, -# "create_time": "", -# "university_id": 0, -# "time": "", -# "course_id": 0, -# "university_logo": "0", -# "university_mini_logo": "0", -# "id": 0, -# "is_pro": true, -# "color_code": 0 -# } -def get_lesson_list(course: dict, TEMP_FOLDER: str, name_prefix: str = ""): +def get_lesson_list(course: dict, name_prefix: str = ""): lesson_data = rainclassroom_sess.get( - f"https://{YKT_HOST}/v2/api/web/logs/learn/{course['classroom_id']}?actype=14&page=0&offset=500&sort=-1").json() + f"https://{YKT_HOST}/v2/api/web/logs/learn/{course['classroom_id']}?actype=-1&page=0&offset=500&sort=-1").json() folder_name = f"{course['name']}-{course['teacher']['name']}" + folder_name = option.windows_filesame_sanitizer(folder_name) + + if idm_flag: + folder_name = folder_name.replace('/', '\\') + folder_name = re.sub(r'[“”]', '_', folder_name) + + print('folder name would be:',folder_name) # Rename old folder if os.path.exists(f"{DOWNLOAD_FOLDER}/{course['name']}"): @@ -175,432 +235,474 @@ os.makedirs(f"{DOWNLOAD_FOLDER}/{folder_name}", exist_ok=True) os.makedirs(f"{CACHE_FOLDER}/{folder_name}", exist_ok=True) - name_prefix += folder_name + "/" + + + name_prefix += folder_name.rstrip() + "/" + name_prefix = option.windows_filesame_sanitizer(name_prefix) if args.lesson_name_filter is not None: - lesson_data['data']['activities'] = [l for l in lesson_data['data']['activities'] if args.lesson_name_filter in l['title']] + lesson_data['data']['activities'] = [l for l in lesson_data['data']['activities'] if + args.lesson_name_filter in l['title']] length = len(lesson_data['data']['activities']) if args.video: for index, lesson in enumerate(lesson_data['data']['activities']): - if interrupted: - return + if not lesson['type'] in [2, 14, 15, 17]: + continue + + lesson['classroom_id'] = course['classroom_id'] # Lesson try: - download_lesson_video(lesson, TEMP_FOLDER, name_prefix + str(length - index)) - except Exception as e: - print(e) + if lesson['type'] == 2: + print('Script type detected!') + download_lesson_video_type2(lesson, name_prefix + str(length - index)) + elif lesson['type'] == 14: + print('Normal type detected!') + download_lesson_video(lesson, name_prefix + str(length - index)) + elif lesson['type'] == 15: + print('MOOCv2 type detected!') + download_lesson_video_type15(lesson, name_prefix + str(length - index)) + elif lesson['type'] == 17: + print('MOOCv1 type detected!') + download_lesson_video_type17(lesson, name_prefix + str(length - index)) + except Exception: + print(traceback.format_exc()) + print(f"Failed to download video for {name_prefix} - {lesson['title']}", file=sys.stderr) + + print('sbykt may not prepare cold data in one run, rescanning for missing ones') + + for index, lesson in enumerate(lesson_data['data']['activities']): + if not lesson['type'] in [14, 15, 17]: + continue + + lesson['classroom_id'] = course['classroom_id'] + + # Lesson + try: + if lesson['type'] == 14: + print('Normal type detected!') + download_lesson_video(lesson, name_prefix + str(length - index)) + elif lesson['type'] == 15: + print('MOOCv2 type detected!') + download_lesson_video_type15(lesson, name_prefix + str(length - index)) + elif lesson['type'] == 17: + print('MOOCv1 type detected!') + download_lesson_video_type17(lesson, name_prefix + str(length - index)) + except Exception: + print(traceback.format_exc()) print(f"Failed to download video for {name_prefix} - {lesson['title']}", file=sys.stderr) if args.ppt: for index, lesson in enumerate(lesson_data['data']['activities']): - if interrupted: - return + if lesson['type'] in (15, 17): + print("mooc type has no ppts!") + continue + lesson['classroom_id'] = course['classroom_id'] # Lesson try: - download_lesson_ppt(lesson, TEMP_FOLDER, name_prefix + str(length - index)) - except Exception as e: - print(e) + download_lesson_ppt(lesson, name_prefix + str(length - index)) + except Exception: + print(traceback.format_exc()) print(f"Failed to download PPT for {name_prefix} - {lesson['title']}", file=sys.stderr) -# --- --- --- Section Popen --- --- --- # + print('sbykt may not prepare cold data in one run, rescanning for missing ones') + for index, lesson in enumerate(lesson_data['data']['activities']): + if lesson['type'] in (15, 17): + print("mooc type has no ppts!") + continue + lesson['classroom_id'] = course['classroom_id'] -def popen(cmd: str, interrupt, fail_msg: str): - print("Start:", cmd) - pcs = subprocess.Popen(cmd, shell=True, stdout=sys.stdout, stderr=sys.stderr) + # Lesson + try: + download_lesson_ppt(lesson, name_prefix + str(length - index)) + except Exception: + print(traceback.format_exc()) + print(f"Failed to download PPT for {name_prefix} - {lesson['title']}", file=sys.stderr) - while pcs.poll() is None: - if interrupted: - interrupt(pcs) - - if pcs.poll() is None: - pcs.send_signal(signal.SIGTERM) - time.sleep(0.5) - if pcs.poll() is None: - pcs.send_signal(signal.SIGKILL) - - raise KeyboardInterrupt() - - time.sleep(0.5) - - print("End:", cmd) - - if pcs.wait() != 0: - raise Exception(fail_msg) - - -def aria2c_interrupt(pcs): - pcs.send_signal(signal.SIGINT) - - while pcs.poll() is None: - time.sleep(0.5) # --- --- --- Section Download Lesson Video --- --- --- # -# { -# "type": 14, -# "id": 7153416, -# "courseware_id": "909642544544463488", -# "title": "R8-三相-周期非正弦", -# "create_time": 1686274642000, -# "attend_status": true, -# "is_finished": true -# } + +from video_processing import download_segments_in_parallel, concatenate_segments -def download_lesson_video(lesson: dict, TEMP_FOLDER, name_prefix: str = ""): +def download_lesson_video(lesson: dict, name_prefix: str = ""): lesson_video_data = rainclassroom_sess.get( f"https://{YKT_HOST}/api/v3/lesson-summary/replay?lesson_id={lesson['courseware_id']}").json() - name_prefix += "-" + lesson['title'] + + name_prefix += "-" + lesson['title'].rstrip() + name_prefix = option.windows_filesame_sanitizer(name_prefix) + + if idm_flag: + name_prefix = re.sub(r'[“”]', '_', name_prefix) if 'live' not in lesson_video_data['data']: - print(f"Skipping {name_prefix} - No Video", file=sys.stderr) - return + print(f"v3 protocol detection failed, falling back to v1") + + fallback_flag = 1 + + lesson_video_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/v/lesson/get_lesson_replay_timeline/?lesson_id={lesson['courseware_id']}").json() + + if 'live_timeline' not in lesson_video_data['data'] or len(lesson_video_data['data']['live_timeline']) == 0: + print(f"Skipping {name_prefix} - No Video", file=sys.stderr) + return + else: + fallback_flag = 0 + + if len(lesson_video_data['data']['live']) == 0: + print(f"Skipping {name_prefix} - No Video", file=sys.stderr) + return if os.path.exists(f"{DOWNLOAD_FOLDER}/{name_prefix}.mp4"): print(f"Skipping {name_prefix} - Video already present") - time.sleep(0.5) + time.sleep(0.25) return has_error = False - for order, segment in enumerate(lesson_video_data['data']['live']): - if interrupted: - return + # Download segments in parallel + try: + download_segments_in_parallel(idm_flag, fallback_flag, CACHE_FOLDER, lesson_video_data, name_prefix) + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix}", file=sys.stderr) + has_error = True - # Segment - try: - download_segment(segment['url'], order, name_prefix) - except Exception as e: - print(e) - print(f"Failed to download {name_prefix} - {segment['order']}", file=sys.stderr) - has_error = True - - if not has_error and len(lesson_video_data['data']['live']) > 0: - print(f"Concatenating {name_prefix}") - - ffmpeg_input_file = f"{TEMP_FOLDER}/concat.txt" - - # Get absolute path of the video files - cache_absolute = os.path.abspath(f"{CACHE_FOLDER}") - - with open(ffmpeg_input_file, "w") as f: - f.write("\n".join( - [f"file '{cache_absolute}/{name_prefix}-{i}.mp4'" for i in range(len(lesson_video_data['data']['live']))] - )) - - cmd = f"ffmpeg -f concat -safe 0 -hwaccel cuda -hwaccel_output_format cuda -i {ffmpeg_input_file} -c:v hevc_nvenc -b:v 200k -maxrate 400k -bufsize 3200k -r 8 -rc-lookahead 1024 -c:a aac -rematrix_maxval 1.0 -ac 1 -b:a 64k '{DOWNLOAD_FOLDER}/{name_prefix}.mp4' -n -hide_banner -loglevel warning -stats" - - def ffmpeg_interrupt(pcs): - # Interrupt and kill ffmpeg, delete the incomplete file - pcs.send_signal(signal.SIGINT) - time.sleep(0.5) - pcs.send_signal(signal.SIGKILL) - time.sleep(0.3) - os.remove(f"{DOWNLOAD_FOLDER}/{name_prefix}.mp4") - - popen(cmd, ffmpeg_interrupt, f"Failed to concatenate {name_prefix}") + # Start concatenation if downloads were successful + if not has_error: + time.sleep(1) + if 'live' in lesson_video_data['data'] and len(lesson_video_data['data']['live']) > 0: + print(f"Concatenating {name_prefix}") + concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix, len(lesson_video_data['data']['live'])) + elif 'live_timeline' in lesson_video_data['data'] and len(lesson_video_data['data']['live_timeline']) > 0: + print(f"Concatenating {name_prefix}") + concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix, + len(lesson_video_data['data']['live_timeline'])) + else: + print('concatenate cannot start due to previous failure') + else: + print('concatenate cannot start due to previous failure') if has_error: with open(f"{DOWNLOAD_FOLDER}/error.log", "a") as f: f.write(f"{name_prefix}\n") -# --- --- --- Section Download Segment --- --- --- # -# { -# "id": "743834725938342272", -# "code": "kszt_DdQU9sOod7o", -# "type": 2, -# "source": "th", -# "url": "https://kszt-playback.xuetangx.com/gifshow-xuetangx/73466bdb387702307504996781/f0.mp4?auth_key=1729778852-4128559473511008914-0-e0c959d1504f92ef5a5d45000f46330d", -# "start": 1666508813000, -# "end": 1666510612000, -# "duration": 1799000, -# "hiddenStatus": 0, -# "order": 0, -# "replayOssStatus": 0, -# "recordFileId": "", -# "recordType": "", -# "subtitlePath": "" -# } +def download_lesson_video_type15(lesson: dict, name_prefix: str = ""): + mooc_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/c27/online_courseware/xty/kls/pub_news/{lesson['courseware_id']}/", + headers={ + "Xtbz": "ykt", + "Classroom-Id": str(lesson['classroom_id']) + } + ).json() + + for chapter in mooc_data['data']['content_info']: + chapter_name = chapter['name'] + + for orphan in chapter['leaf_list']: + orphan_title = orphan['title'] + orphan_id = orphan['id'] + has_error = False + + name_prefix_orphan = name_prefix + chapter_name + " - " + orphan_title + name_prefix_orphan = option.windows_filesame_sanitizer(name_prefix_orphan) + + if idm_flag: + name_prefix_orphan = re.sub(r'[“”]', '_', name_prefix_orphan) + + mooc_orphan_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/mooc-api/v1/lms/learn/leaf_info/{str(lesson['classroom_id'])}/{str(orphan_id)}/", + headers={ + "Xtbz": "ykt", + "Classroom-Id": str(lesson['classroom_id']) + } + ).json() + + if 'data' not in mooc_orphan_data or 'content_info' not in mooc_orphan_data['data']: + print('no media detected, skipping!') + continue + + mooc_orphan_media_id = mooc_orphan_data['data']['content_info']['media']['ccid'] + mooc_orphan_media_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/api/open/audiovideo/playurl?video_id={mooc_orphan_media_id}&provider=cc&is_single=0&format=json" + ).json() + + quality_keys = list(map(lambda x: (int(x[7:]), x), mooc_orphan_media_data['data']['playurl']['sources'].keys())) + quality_keys.sort(key=lambda x: x[0], reverse=True) + download_url_list = mooc_orphan_media_data['data']['playurl']['sources'][quality_keys[0][1]] + # print(download_url_list) + + # Download segments in parallel + try: + download_segments_in_parallel(idm_flag, 2, CACHE_FOLDER, download_url_list, name_prefix_orphan) + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix}", file=sys.stderr) + has_error = True + + # Start concatenation if downloads were successful + if not has_error: + time.sleep(0.25) + if 'playurl' in mooc_orphan_media_data['data'] and len(download_url_list) > 0: + print(f"Concatenating {name_prefix}") + concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix_orphan, len(download_url_list)) + else: + print('concatenate cannot start due to previous failure') + else: + print('concatenate cannot start due to previous failure') + + if has_error: + with open(f"{DOWNLOAD_FOLDER}/error.log", "a") as f: + f.write(f"{name_prefix}\n") + + for section in chapter['section_list']: + section_name = section['name'] + + for lesson_d in section['leaf_list']: + lesson_name = lesson_d['title'] + lesson_id = lesson_d['id'] + has_error = False + + name_prefix_lesson = name_prefix + chapter_name + " - " + section_name + " - " + lesson_name + name_prefix_lesson = option.windows_filesame_sanitizer(name_prefix_lesson) + + if idm_flag: + name_prefix_lesson = re.sub(r'[“”]', '_', name_prefix_lesson) + + mooc_lesson_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/mooc-api/v1/lms/learn/leaf_info/{str(lesson['classroom_id'])}/{str(lesson_id)}/", + headers={ + "Xtbz": "ykt", + "Classroom-Id": str(lesson['classroom_id']) + } + ).json() + + if 'data' not in mooc_lesson_data or 'content_info' not in mooc_lesson_data['data']: + print('no media detected, skipping!') + continue + + mooc_media_id = mooc_lesson_data['data']['content_info']['media']['ccid'] + + mooc_media_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/api/open/audiovideo/playurl?video_id={mooc_media_id}&provider=cc&is_single=0&format=json" + ).json() + + quality_keys = list(map(lambda x: (int(x[7:]), x), mooc_media_data['data']['playurl']['sources'].keys())) + quality_keys.sort(key=lambda x: x[0], reverse=True) + download_url_list = mooc_media_data['data']['playurl']['sources'][quality_keys[0][1]] + # print(download_url_list) + + # Download segments in parallel + try: + download_segments_in_parallel(idm_flag, 2, CACHE_FOLDER, download_url_list, name_prefix_lesson) + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix}", file=sys.stderr) + has_error = True + + # Start concatenation if downloads were successful + if not has_error: + time.sleep(1) + if 'playurl' in mooc_media_data['data'] and len(download_url_list) > 0: + print(f"Concatenating {name_prefix}") + concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix_lesson, len(download_url_list)) + else: + print('concatenate cannot start due to previous failure') + else: + print('concatenate cannot start due to previous failure') + + if has_error: + with open(f"{DOWNLOAD_FOLDER}/error.log", "a") as f: + f.write(f"{name_prefix}\n") -def download_segment(url: str, order: int, name_prefix: str = ""): - print(f"Downloading {name_prefix} - {order}") - cmd = f"aria2c -o '{CACHE_FOLDER}/{name_prefix}-{order}.mp4' -x 4 -s 2 '{url}' -c --log-level warn" +def download_lesson_video_type17(lesson: dict, name_prefix: str = ""): + mooc_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/c27/online_courseware/xty/kls/pub_news/{lesson['courseware_id']}/", + headers={ + "Xtbz": "ykt", + "Classroom-Id": str(lesson['classroom_id']) + } + ).json() - popen(cmd, aria2c_interrupt, f"Failed to download {name_prefix}-{order}") + if 'name' not in mooc_data['data']['content_info'] or 'content_info' not in mooc_data['data']: + print('no media detected, skipping!') + return -# --- --- --- Section Download Lesson PPT --- --- --- # -# { -# "code": 0, -# "msg": "OK", -# "data": { -# "lesson": { -# "id": "1267751345493205504", -# "title": "", -# "startTime": 1728964537223, -# "endTime": 1728965834350, -# "teacherIdentityId": "15753469", -# "classroom": { -# "id": "3134428", -# "name": "", -# "pro": true -# }, -# "course": { -# "id": "1360043", -# "name": "计算机网络原理" -# } -# }, -# "fileSharing": { -# "count": 0, -# "cover": null -# }, -# "teacher": { -# "identityId": "15753469", -# "avatar": "0", -# "name": "徐明伟", -# "number": "1998990267" -# }, -# "replayType": 0, -# "replayOssStatus": 0, -# "presentations": [ -# { -# "id": "1267751453966295552", -# "title": "", -# "cover": "", -# "slidesCount": 21, -# "totalSlidesCount": 59, -# "doubtCount": 0, -# "collectCount": 0, -# "conf": "" -# } -# ], -# "user": { -# "identityId": "", -# "avatar": "", -# "name": "", -# "number": "" -# }, -# "activityId": "7970721", -# "memoContent": "", -# "liveViewed": false, -# "doubtSlides": [], -# "collectSlides": [], -# "checkIn": { -# "lessonId": "", -# "identityId": "21640720", -# "score": 1000, -# "source": 5, -# "time": 1728964549329, -# "valid": 1, -# "problemScore": 1000, -# "quizScore": -1, -# "duration": 0, -# "addScore": null, -# "redEnvelope": 0, -# "correctCount": 10, -# "incorrectCount": 4, -# "unMarkCount": 0 -# }, -# "quizzes": [], -# "danmuList": [], -# "tougaoList": [], -# "toastType": 0, -# "problems": [ -# { -# "problemId": "1267751453974684162", -# "problemType": 1, -# "problemScore": 100, -# "index": 3, -# "cover": "", -# "presentationId": "1267751453966295552", -# "answer": [ -# "D" -# ], -# "ans_type": "", -# "comment": {}, -# "correctAnswer": [ -# "D" -# ], -# "score": 100, -# "submitTime": 1728964586371, -# "scoreTime": 0, -# "correct": true, -# "blankStatus": [], -# "anonymous": null, -# "remarkDetail": {}, -# "teamInfo": null -# } -# ] -# } -# } + only_lesson_name = mooc_data['data']['content_info']['name'] + only_lesson_id = mooc_data['data']['content_info']['id'] + + has_error = False + + name_prefix_lesson = name_prefix + only_lesson_name + name_prefix_lesson = option.windows_filesame_sanitizer(name_prefix_lesson) + + if idm_flag: + name_prefix_lesson = re.sub(r'[“”]', '_', name_prefix_lesson) + + mooc_lesson_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/mooc-api/v1/lms/learn/leaf_info/{str(lesson['classroom_id'])}/{str(only_lesson_id)}/", + headers={ + "Xtbz": "ykt", + "Classroom-Id": str(lesson['classroom_id']) + } + ).json() + + if 'data' not in mooc_lesson_data or 'content_info' not in mooc_lesson_data['data']: + print('no media detected, skipping!') + return + + mooc_media_id = mooc_lesson_data['data']['content_info']['media']['ccid'] + + mooc_media_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/api/open/audiovideo/playurl?video_id={mooc_media_id}&provider=cc&is_single=0&format=json" + ).json() + + quality_keys = list(map(lambda x: (int(x[7:]), x), mooc_media_data['data']['playurl']['sources'].keys())) + quality_keys.sort(key=lambda x: x[0], reverse=True) + download_url_list = mooc_media_data['data']['playurl']['sources'][quality_keys[0][1]] + # print(download_url_list) + + # Download segments in parallel + try: + download_segments_in_parallel(idm_flag, 2, CACHE_FOLDER, download_url_list, name_prefix_lesson) + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix}", file=sys.stderr) + has_error = True + + # Start concatenation if downloads were successful + if not has_error: + time.sleep(1) + if 'playurl' in mooc_media_data['data'] and len(download_url_list) > 0: + print(f"Concatenating {name_prefix}") + concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix_lesson, len(download_url_list)) + else: + print('concatenate cannot start due to previous failure') + else: + print('concatenate cannot start due to previous failure') + + if has_error: + with open(f"{DOWNLOAD_FOLDER}/error.log", "a") as f: + f.write(f"{name_prefix}\n") -def download_lesson_ppt(lesson: dict, TEMP_FOLDER, name_prefix: str = ""): - lesson_data = rainclassroom_sess.get(f"https://{YKT_HOST}/api/v3/lesson-summary/student?lesson_id={lesson['courseware_id']}").json() - name_prefix += "-" + lesson['title'] +def download_lesson_video_type2(lesson: dict, name_prefix: str = ""): + # "id": 6036907, "courseware_id": "1055476" + # https://pro.yuketang.cn/v2/api/web/cards/detlist/1055476?classroom_id=3058049 + + lesson_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/v2/api/web/cards/detlist/{lesson['courseware_id']}?classroom_id={lesson['classroom_id']}").json() + name_prefix += "-" + lesson_data['data']['Title'].strip() + + name_prefix = option.windows_filesame_sanitizer(name_prefix) + + for slide in lesson_data['data']['Slides']: + slide_id = slide['PageIndex'] + for shape in slide['Shapes']: + if shape['ShapeType'] == 1 and 'file_title' in shape: + file_title = shape['file_title'] + quality_keys = list(map(lambda x: (int(x[7:]), x), shape['playurls'].keys())) + quality_keys.sort(key=lambda x: x[0], reverse=True) + download_url_list = shape['playurls'][quality_keys[0][1]] + + name_prefix_shape = name_prefix + f" - {slide_id} - {file_title}" + name_prefix_shape = option.windows_filesame_sanitizer(name_prefix_shape) + + if idm_flag: + name_prefix_shape = re.sub(r'[“”]', '_', name_prefix_shape) + + # Download segments in parallel + try: + download_segments_in_parallel(idm_flag, 2, CACHE_FOLDER, download_url_list, name_prefix_shape) + has_error = False + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix}", file=sys.stderr) + has_error = True + + # Start concatenation if downloads were successful + if not has_error: + time.sleep(1) + if 'playurl' in shape and len(download_url_list) > 0: + print(f"Concatenating {name_prefix}") + concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix_shape, len(download_url_list)) + else: + print('concatenate cannot start due to previous failure') + else: + print('concatenate cannot start due to previous failure') + + if has_error: + with open(f"{DOWNLOAD_FOLDER}/error.log", "a") as f: + f.write(f"{name_prefix}\n") + + +from ppt_processing import download_ppt + + +def download_lesson_ppt(lesson: dict, name_prefix: str = ""): + lesson_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/api/v3/lesson-summary/student?lesson_id={lesson['courseware_id']}").json() + name_prefix += "-" + lesson['title'].rstrip() + + name_prefix = option.windows_filesame_sanitizer(name_prefix) if 'presentations' not in lesson_data['data']: - print(f"Skipping {name_prefix} - No PPT", file=sys.stderr) - return + print(f"v3 protocol detection failed, falling back to v1") - for index, ppt in enumerate(lesson_data['data']['presentations']): - if interrupted: + ppt_info = rainclassroom_sess.get( + f"https://{YKT_HOST}/v2/api/web/lessonafter/{lesson['courseware_id']}/presentation?classroom_id={lesson['classroom_id']}").json() + if 'id' not in ppt_info['data'][0]: + print(f"Skipping {name_prefix} - No PPT", file=sys.stderr) return - # PPT - try: - download_ppt(lesson["courseware_id"], TEMP_FOLDER, ppt['id'], name_prefix + f"-{index}") - except Exception as e: - print(e) - print(f"Failed to download PPT {name_prefix} - {ppt['title']}", file=sys.stderr) + for index, ppt in enumerate(ppt_info['data']): + # PPT + try: + ppt_raw_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/v2/api/web/lessonafter/presentation/{ppt['id']}?classroom_id={lesson['classroom_id']}").json() + download_ppt(1, args.ppt_problem_answer, args.ppt_to_pdf, CACHE_FOLDER, DOWNLOAD_FOLDER, args.aria2c_path, + ppt_raw_data, name_prefix + f"-{index}") -# --- --- --- Section Download PPT --- --- --- # -# { -# "code": 0, -# "msg": "OK", -# "data": { -# "presentation": { -# "id": "714674183600571776", -# "title": "L1_课程介绍", -# "cover": "https://qn-st0.yuketang.cn/FudgWS2XoU3bXLxReeSBBhYTWJsX", -# "width": 720, -# "height": 540, -# "conf": { -# "show_presentation": "all", -# "slides": [ -# "714674183617348992" -# ], -# "hide_slides": [] -# } -# }, -# "slides": [ -# { -# "id": "714674183617348992", -# "index": 1, -# "doubtCount": 0, -# "collectCount": 0, -# "cover": "https://qn-st0.yuketang.cn/FudgWS2XoU3bXLxReeSBBhYTWJsX", -# "problem": null, -# "result": null -# } -# ] -# } -# } + except Exception as e: + print(traceback.format_exc()) + print(f"Failed to download PPT {name_prefix} - {ppt['title']}", file=sys.stderr) + else: + for index, ppt in enumerate(lesson_data['data']['presentations']): + # PPT + try: + ppt_raw_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/api/v3/lesson-summary/student/presentation?presentation_id={ppt['id']}&lesson_id={lesson['courseware_id']}").json() + download_ppt(3, args.ppt_problem_answer, args.ppt_to_pdf, CACHE_FOLDER, DOWNLOAD_FOLDER, args.aria2c_path, + ppt_raw_data, name_prefix + f"-{index}") -def download_ppt(lesson_id: str, TEMP_FOLDER, ppt_id: str, name_prefix: str = ""): - print(f"Downloading {name_prefix}") - ppt_raw_data = rainclassroom_sess.get(f"https://{YKT_HOST}/api/v3/lesson-summary/student/presentation?presentation_id={ppt_id}&lesson_id={lesson_id}").json() - name_prefix += "-" + ppt_raw_data['data']['presentation']['title'] - - # If PDF is present, skip - if os.path.exists(f"{DOWNLOAD_FOLDER}/{name_prefix}.pdf"): - print(f"Skipping {name_prefix} - PDF already present") - time.sleep(0.5) - return - - os.makedirs(f"{DOWNLOAD_FOLDER}/{name_prefix}", exist_ok=True) - - images = [] - - aria2_input_file = f"{TEMP_FOLDER}/ppt_download.txt" - - with open(aria2_input_file, "w") as f: - for slide in ppt_raw_data['data']['slides']: - if not slide.get('cover'): - continue - - f.write(f"{slide['cover']}\n out={DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg\n") - images.append(f"{DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg") - - # if os.path.exists(f"{DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg"): - # print(f"Skipping {name_prefix} - {slide['index']}") - # continue - # - # with open(f"{DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg", "wb") as f: - # f.write(requests.get(slide['cover']).content) - - cmd = f"aria2c -i {aria2_input_file} -x 16 -j 16 -c --log-level warn" - - popen(cmd, aria2c_interrupt, f"Failed to download {name_prefix}") - - from PIL import Image - - if args.ppt_problem_answer: - from PIL import ImageDraw, ImageFont - - for problem in ppt_raw_data['data']['slides']: - if problem['problem'] is None: - continue - - if not problem.get('cover'): - continue - - answer = "Answer: " + "; ".join(problem['problem']['content']['answer']) - - image = Image.open(f"{DOWNLOAD_FOLDER}/{name_prefix}/{problem['index']}.jpg").convert("RGB") - - draw = ImageDraw.Draw(image) - - # Load the font - font = ImageFont.load_default(size=40) - text_bbox = draw.textbbox(xy=(20, 20), text=answer, font=font) - - # Add semi-transparent black rectangle - draw.rectangle([text_bbox[0] - 10, text_bbox[1] - 10, text_bbox[2] + 10, text_bbox[3] + 10], fill="#bbb") - - # Draw the text on top (white) - draw.text((text_bbox[0], text_bbox[1]), answer, anchor="lt", font=font, fill="#333") - - image.save(f"{DOWNLOAD_FOLDER}/{name_prefix}/{problem['index']}-ans.jpg") - - # Replace the image in the list - images[images.index(f"{DOWNLOAD_FOLDER}/{name_prefix}/{problem['index']}.jpg")] = f"{DOWNLOAD_FOLDER}/{name_prefix}/{problem['index']}-ans.jpg" - - print(f"Added Answer to {name_prefix} - {problem['index']}") - - if not args.ppt_to_pdf: - return - - print(f"Converting {name_prefix}") - - images = [Image.open(i) for i in images] - images[0].save(f"{DOWNLOAD_FOLDER}/{name_prefix}.pdf", "PDF", resolution=100.0, save_all=True, append_images=images[1:]) - - print(f"Converted {name_prefix}") + except Exception as e: + print(traceback.format_exc()) + print(f"Failed to download PPT {name_prefix} - {ppt['title']}", file=sys.stderr) # --- --- --- Section Main --- --- --- # - -def thread_worker(course): - # Make a thread-specific cache folder - TEMP_FOLDER = tempfile.mkdtemp() - print(f"Temp Folder: {TEMP_FOLDER}") - - try: - get_lesson_list(course, TEMP_FOLDER) - except Exception as e: - print(e) - print(f"Failed to parse {course['name']}", file=sys.stderr) - - # Remove temp folder - print("Removing Temp Folder") - os.system(f"rm -rf {TEMP_FOLDER}") - +print('successfully parsed account info!') for course in courses: - pool.apply_async(thread_worker, (course,)) -try: - pool.close() - pool.join() -except KeyboardInterrupt: - print("Interrupted") - interrupted = True - pool.terminate() - pool.join() + skip_flag = 0 + try: + print(course) + if not allin_flag: + skip_flag = option.ask_for_input() + if skip_flag: + continue + else: + get_lesson_list(course) + else: + get_lesson_list(course) + except Exception as e: + print(traceback.format_exc()) + print(f"Failed to parse {course['name']}", file=sys.stderr) diff --git a/option.py b/option.py new file mode 100644 index 0000000..f8b6517 --- /dev/null +++ b/option.py @@ -0,0 +1,67 @@ +import re +import sys + + +def ask_for_input(): + while True: + user_input = input("Do you want to continue/abort/skip_current? (y/n/s): ").lower() + if user_input == 'y': + print("Proceeding...") + return 0 # Don't skip + elif user_input == 'n': + print("Aborting the program.") + sys.exit() # Exit the program if 'n' is chosen + elif user_input == 's': + print("Skipping current...") + return 1 # Set skip_flag to 1 + else: + print("Invalid input, please enter 'y', 'n', or 's'.") + + +def ask_for_allin(): + while True: + print('asking for whether to download all at once...') + confirmation = input( + "All in Means download everything at once.\n" + "This may take a long time and require over 100G of disk space.\n" + " Are you sure? (y/n): ").lower() + if confirmation == 'y': + print("All in! Ensure more than 100G disk space available in current directory!!!") + print("May take a looooooong time to finish!!!") + return 1 # Set allin_flag to 1 + elif confirmation == 'n': + print("Cancelled 'All in' operation.") + return 0 + else: + print("Invalid input, please enter 'y' or 'n'.") + + +def ask_for_idm(): + while True: + print('asking for whether to download with IDM...') + confirmation = input( + "IDM is a fast parallel downloader.\n" + "You need to install IDM and add idman.exe to SYSTEM PATH!!!\n" + "Without installing IDM the script won't run!!!!!!!!\n" + " Are you sure? (y/n): ").lower() + if confirmation == 'y': + print("Choosing IDM as download method") + print("Enjoy fast downloading") + return 1 # Set idm_flag to 1 + elif confirmation == 'n': + print("Choosing default download method") + return 0 + else: + print("Invalid input, please enter 'y' or 'n'.") + +def windows_filesame_sanitizer(input_str): + # Remove illegal characters for Windows filenames + input_str = re.sub(r'[<>:"\\|?*\x00-\x1F]', '_', input_str) + input_str = re.sub(r'[\x80-\xFF]', '', input_str) + # Step 2: Preserve the first `/` and replace the rest with underscores + parts = input_str.split("/", 1) # Split into two parts at the first slash + if len(parts) > 1: + input_str = parts[0] + "/" + parts[1].replace("/", "_") # Preserve first slash, replace others + else: + input_str = parts[0] # No slashes found + return input_str \ No newline at end of file diff --git a/ppt_processing.py b/ppt_processing.py new file mode 100644 index 0000000..479bf5e --- /dev/null +++ b/ppt_processing.py @@ -0,0 +1,105 @@ +import os +import time +import subprocess +import re +import option +import sys + +WINDOWS = sys.platform == 'win32' + + +def download_ppt(version, arg_ans, arg_pdf, CACHE_FOLDER, DOWNLOAD_FOLDER, ARIA2C_PATH, ppt_raw_data, name_prefix: str = ""): + print(f"Downloading {name_prefix}") + + if version == 1: + name_prefix += "-" + ppt_raw_data['data']['title'].rstrip() + else: + name_prefix += "-" + ppt_raw_data['data']['presentation']['title'].rstrip() + + name_prefix = option.windows_filesame_sanitizer(name_prefix) + + # If PDF is present, skip + if os.path.exists(f"{DOWNLOAD_FOLDER}/{name_prefix}.pdf"): + print(f"Skipping {name_prefix} - PDF already present") + time.sleep(0.25) + return + + os.makedirs(f"{DOWNLOAD_FOLDER}/{name_prefix}", exist_ok=True) + + images = [] + + if version == 1: + with open(f"{CACHE_FOLDER}/ppt_download.txt", "w", encoding='utf-8') as f: + for slide in ppt_raw_data['data']['slides']: + if not slide.get('Cover'): + continue + + f.write(f"{slide['Cover']}\n out={DOWNLOAD_FOLDER}/{name_prefix}/{slide['Index']}.jpg\n") + images.append(f"{DOWNLOAD_FOLDER}/{name_prefix}/{slide['Index']}.jpg") + + else: + with open(f"{CACHE_FOLDER}/ppt_download.txt", "w", encoding='utf-8') as f: + for slide in ppt_raw_data['data']['slides']: + if not slide.get('cover'): + continue + + f.write(f"{slide['cover']}\n out={DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg\n") + images.append(f"{DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg") + + ppt_download_command = (f"{ARIA2C_PATH} -i {CACHE_FOLDER}/ppt_download.txt -x 16 -s 16 -c " + f"-l aria2c_ppt.log --log-level warn") + + if WINDOWS: + subprocess.run(['powershell', '-Command', ppt_download_command], text=True) + else: + subprocess.run(ppt_download_command, shell=True) + + from PIL import Image + + if arg_ans and version != 1: + from PIL import ImageDraw, ImageFont + + for problem in ppt_raw_data['data']['slides']: + if problem['problem'] is None: + continue + + if not problem.get('cover'): + continue + + answer = "Answer: " + "; ".join(problem['problem']['content']['answer']) + + image = Image.open(f"{DOWNLOAD_FOLDER}/{name_prefix}/{problem['index']}.jpg").convert("RGB") + + draw = ImageDraw.Draw(image) + + # Load the font + font = ImageFont.load_default(size=40) + text_bbox = draw.textbbox(xy=(20, 20), text=answer, font=font) + + # Add semi-transparent black rectangle + draw.rectangle([text_bbox[0] - 10, text_bbox[1] - 10, text_bbox[2] + 10, text_bbox[3] + 10], fill="#bbb") + + # Draw the text on top (white) + draw.text((text_bbox[0], text_bbox[1]), answer, anchor="lt", font=font, fill="#333") + + image.save(f"{DOWNLOAD_FOLDER}/{name_prefix}/{problem['index']}-ans.jpg") + + # Replace the image in the list + images[images.index( + f"{DOWNLOAD_FOLDER}/{name_prefix}/{problem['index']}.jpg")] = f"{DOWNLOAD_FOLDER}/{name_prefix}/{problem['index']}-ans.jpg" + + print(f"Added Answer to {name_prefix} - {problem['index']}") + + if not arg_pdf: + return + + print(f"Converting {name_prefix}") + + images = [Image.open(i) for i in images] + images[0].save(f"{DOWNLOAD_FOLDER}/{name_prefix}.pdf", "PDF", resolution=100.0, save_all=True, + append_images=images[1:]) + + print(f"Converted {name_prefix}") + + # can be done like this TODO + # l2 = map(lambda x: x['B2'], ppt_raw_data['data']['slides']) diff --git a/scrap_example.txt b/scrap_example.txt new file mode 100644 index 0000000..656ad98 --- /dev/null +++ b/scrap_example.txt @@ -0,0 +1,253 @@ +# --- --- --- Section Get Lesson List --- --- --- # +# { +# "university_name": "", +# "term": 202401, +# "university_logo_pic": "", +# "name": "NAME", +# "type_count": [], +# "students_count": 7, +# "color_system": 3, +# "course": { +# "update_time": "", +# "name": "", +# "admin_id": 0, +# "university_id": 0, +# "type": 0, +# "id": 0 +# }, +# "teacher": { +# "user_id": 0, +# "name": "", +# "avatar": "" +# }, +# "create_time": "", +# "university_id": 0, +# "time": "", +# "course_id": 0, +# "university_logo": "0", +# "university_mini_logo": "0", +# "id": 0, +# "is_pro": true, +# "color_code": 0 +# } + + +# --- --- --- Section Download Lesson Video --- --- --- # +# { +# "type": 14, +# "id": 7153416, +# "courseware_id": "909642544544463488", +# "title": "R8-三相-周期非正弦", +# "create_time": 1686274642000, +# "attend_status": true, +# "is_finished": true +# } + +# --- --- --- Section Download Lesson Video Old--- --- --- # + +{ + "msg": "", + "data": { + "lesson_duration": 6923000, + "live_timeline": [ + { + "replay_url": "https://bd-snap-video.xuetangx.com/bd-flv-video.xuetangx.com/xuetanglive/ProLive-3393352-75277078/xuetang_live_20200408132036.m3u8", + "duration": 171000, + "room_code": "ProLive-3393352-75277078", + "absolute_start": 1586323233000, + "absolute_end": 1586323404000, + "hidden_status": false, + "related_start": 35000, + "source": "bd", + "live_id": 286112, + "type": 2, + "order": 0 + }, + { + "replay_url": "https://kszt-playback.xuetangx.com/gifshow-xuetangx/a0b01f6f5285890801042726599/playlist_eof.m3u8?auth_key=1729973098-4419371689350062664-0-7af7d877d498d161e2b39ecc659f63bb", + "duration": 5850000, + "room_code": "kszt_WHY6X1WB3_Q", + "absolute_start": 1586324173000, + "absolute_end": 1586330023000, + "hidden_status": false, + "related_start": -12000, + "source": "th", + "live_id": 286180, + "type": 2, + "order": 1 + } + ], + "recorded_video": [], + "danmu_timeline": [], + "lesson_timeline": [ + { + "duration": 12748, + "dt": 1586323199012, + "code": "LESSON_START", + "type": "event", + "title": "\u4e0a\u8bfe\u5566\uff01" + }, + ], + "hidden_status": false + }, + "success": true +} + +# --- --- --- Section Download Segment --- --- --- # +# { +# "id": "743834725938342272", +# "code": "kszt_DdQU9sOod7o", +# "type": 2, +# "source": "th", +# "url": "https://kszt-playback.xuetangx.com/gifshow-xuetangx/73466bdb387702307504996781/f0.mp4?auth_key=1729778852-4128559473511008914-0-e0c959d1504f92ef5a5d45000f46330d", +# "start": 1666508813000, +# "end": 1666510612000, +# "duration": 1799000, +# "hiddenStatus": 0, +# "order": 0, +# "replayOssStatus": 0, +# "recordFileId": "", +# "recordType": "", +# "subtitlePath": "" +# } + + +# --- --- --- Section Download Lesson PPT --- --- --- # +# { +# "code": 0, +# "msg": "OK", +# "data": { +# "lesson": { +# "id": "1267751345493205504", +# "title": "计算机网络原理 课堂提问(2)", +# "startTime": 1728964537223, +# "endTime": 1728965834350, +# "teacherIdentityId": "15753469", +# "classroom": { +# "id": "3134428", +# "name": "2024秋-计算机网络原理-2", +# "pro": true +# }, +# "course": { +# "id": "1360043", +# "name": "计算机网络原理" +# } +# }, +# "fileSharing": { +# "count": 0, +# "cover": null +# }, +# "teacher": { +# "identityId": "15753469", +# "avatar": "0", +# "name": "徐明伟", +# "number": "1998990267" +# }, +# "replayType": 0, +# "replayOssStatus": 0, +# "presentations": [ +# { +# "id": "1267751453966295552", +# "title": "计算机网络原理 课堂提问", +# "cover": "https://thu-private-qn.yuketang.cn/slide/11789532/cover435_20241015115349.jpg?e=1729790227&token=IAM-gs8ue1pDIGwtR1CR0Zjdagg7Q2tn5G_1BqTmhmqa:CkT5RMnhXFxcVUWoxVU2xxjA6ac=", +# "slidesCount": 21, +# "totalSlidesCount": 59, +# "doubtCount": 0, +# "collectCount": 0, +# "conf": "{\"show_presentation\":\"film\",\"slides\":[\"1267751453966295553\",\"1267751453974684160\",\"1267751453974684162\",\"1267751453974684164\",\"1267751453983072768\",\"1267751453983072770\",\"1267751453983072772\",\"1267751453983072774\",\"1267751453991461376\",\"1267751453991461378\",\"1267751453991461379\",\"1267751453999849984\",\"1267751453999849985\",\"1267751453999849986\",\"1267751453999849987\",\"1267751454008238592\",\"1267751454008238593\",\"1267751454008238594\",\"1267751454008238595\",\"1267751454016627200\",\"1267751454016627201\"],\"hide_slides\":[]}" +# } +# ], +# "user": { +# "identityId": "21640720", +# "avatar": "http://qn-sx.yuketang.cn/tougao_pic_AFzVVZcN5p9.png", +# "name": "[REDACTED]", +# "number": "2022010426" +# }, +# "activityId": "7970721", +# "memoContent": "", +# "liveViewed": false, +# "doubtSlides": [], +# "collectSlides": [], +# "checkIn": { +# "lessonId": "1267751345493205504", +# "identityId": "21640720", +# "score": 1000, +# "source": 5, +# "time": 1728964549329, +# "valid": 1, +# "problemScore": 1000, +# "quizScore": -1, +# "duration": 0, +# "addScore": null, +# "redEnvelope": 0, +# "correctCount": 10, +# "incorrectCount": 4, +# "unMarkCount": 0 +# }, +# "quizzes": [], +# "danmuList": [], +# "tougaoList": [], +# "toastType": 0, +# "problems": [ +# { +# "problemId": "1267751453974684162", +# "problemType": 1, +# "problemScore": 100, +# "index": 3, +# "cover": "https://thu-private-qn.yuketang.cn/slide/11789532/cover433_20241015115349.jpg?e=1729790227&token=IAM-gs8ue1pDIGwtR1CR0Zjdagg7Q2tn5G_1BqTmhmqa:hLi2EEWUQHgVeKyP9y4bi7neaYQ=", +# "presentationId": "1267751453966295552", +# "answer": [ +# "D" +# ], +# "ans_type": "", +# "comment": {}, +# "correctAnswer": [ +# "D" +# ], +# "score": 100, +# "submitTime": 1728964586371, +# "scoreTime": 0, +# "correct": true, +# "blankStatus": [], +# "anonymous": null, +# "remarkDetail": {}, +# "teamInfo": null +# } +# ] +# } +# } + + +# --- --- --- Section Download PPT --- --- --- # +# { +# "code": 0, +# "msg": "OK", +# "data": { +# "presentation": { +# "id": "714674183600571776", +# "title": "L1_课程介绍", +# "cover": "https://qn-st0.yuketang.cn/FudgWS2XoU3bXLxReeSBBhYTWJsX", +# "width": 720, +# "height": 540, +# "conf": { +# "show_presentation": "all", +# "slides": [ +# "714674183617348992" +# ], +# "hide_slides": [] +# } +# }, +# "slides": [ +# { +# "id": "714674183617348992", +# "index": 1, +# "doubtCount": 0, +# "collectCount": 0, +# "cover": "https://qn-st0.yuketang.cn/FudgWS2XoU3bXLxReeSBBhYTWJsX", +# "problem": null, +# "result": null +# } +# ] +# } +# } + diff --git a/video_processing.py b/video_processing.py new file mode 100644 index 0000000..1db3361 --- /dev/null +++ b/video_processing.py @@ -0,0 +1,300 @@ +import os +import re +import subprocess +import sys +import time +import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed +import shutil + +FFMPEG_PATH = "ffmpeg" if shutil.which("ffmpeg") else os.path.join(os.getcwd(), "ffmpeg") +ARIA2C_PATH = "aria2c" if shutil.which("aria2c") else os.path.join(os.getcwd(), "aria2c") +WINDOWS = sys.platform == 'win32' + + +def download_segment(CACHE_FOLDER, url: str, order: int, name_prefix: str = ""): + print(f"Downloading {name_prefix} - {order}") + + video_download_command = (f"{ARIA2C_PATH} -o '{CACHE_FOLDER}/{name_prefix}-{order}.mp4'" + f" -x 16 -s 16 '{url}' -c -l aria2c_video.log --log-level warn") + + if WINDOWS: + result = subprocess.run(['powershell', '-Command', video_download_command], text=True) + else: + result = subprocess.run(video_download_command, shell=True) + + return result + + +def download_segment_idm(CACHE_FOLDER, url: str, order: int, name_prefix: str = ""): + print(f"Downloading {name_prefix} - {order}") + + name_prefix_idm = name_prefix.replace('/', '\\') + + # Define the full path for the downloaded file + downloaded_file = os.path.join(CACHE_FOLDER, f"{name_prefix_idm}-{order}.mp4") + print("Downloading to", downloaded_file) + + if os.path.exists(downloaded_file): + print(f"Skipping {downloaded_file} - Video already present") + time.sleep(0.25) + return downloaded_file + + # Construct the IDM download command + video_download_command = ( + f"idman /n /d \"{url}\" /p \"$(pwd)\" /f \"{downloaded_file}\"" + ) + + # Start the IDM download process + subprocess.run(['powershell', '-Command', video_download_command], text=True) + + # Wait for the file to appear and its download to complete + print(f"Waiting for {downloaded_file} to finish downloading...") + try: + # Wait until the file is created + while not os.path.exists(downloaded_file): + time.sleep(1) # Wait until the file is created + + # Wait until the file stops growing + prev_size = -1 + while True: + curr_size = os.path.getsize(downloaded_file) + if curr_size == prev_size: + break # File size hasn't changed, download likely complete + prev_size = curr_size + time.sleep(1) # Check every second + + except KeyboardInterrupt: + print("\nDownload interrupted by user.") + # Optionally, clean up the partially downloaded file + if os.path.exists(downloaded_file): + print(f"Removing incomplete file: {downloaded_file}") + os.remove(downloaded_file) + raise # Re-raise the exception to propagate it + + print(f"Download completed: {downloaded_file}") + return downloaded_file + + +def download_segment_m3u8(idm_flag, CACHE_FOLDER, url: str, order: int, name_prefix: str = "", max_retries: int = 35): + print(f"Downloading {name_prefix} - {order}") + print(f"Downloading from {url}") + + # Initial download attempt with 32 workers + + # video_download_command = (f".\\ffmpeg -i '{url}' -c copy -n '{CACHE_FOLDER}/{name_prefix}-{order}.mp4'" + # f" -hide_banner -loglevel error -stats") + + # video_download_command = (f".\\HLSDownloader -u '{url}' -o '{CACHE_FOLDER}/{name_prefix}-{order}.mp4'" + # f" -w 32 -workers 32 ") + + # video_download_command = (f".\\m3u8dl-windows-amd64.exe -i '{url}' -o '{CACHE_FOLDER}/{name_prefix}-{order}.mp4'" + # f" -retry 1 -t '{CACHE_FOLDER}' -thread 32") + + # video_download_command = ( + # f".\\vsd.exe save '{url}' -o '{CACHE_FOLDER}/{name_prefix}-{order}.mp4' " + # f"-d {CACHE_FOLDER} --retry-count 15 -t 4") + + output_path = f"{CACHE_FOLDER}/{name_prefix}-{order}" + save_dir = os.path.dirname(output_path) + save_name = os.path.basename(output_path) + + if 'mp3' in url or not WINDOWS: + if idm_flag: + video_download_command = ( + f"idman /n /d \"{url}\" /p \"$(pwd)\" /f '{CACHE_FOLDER}/{name_prefix}-{order}.mp4'" + ) + else: + video_download_command = ( + f"{FFMPEG_PATH} -i '{url}' -c:v copy -c:a copy -n '{CACHE_FOLDER}/{name_prefix}-{order}.mp4' " + f"-hide_banner -loglevel error -stats" + ) + + else: + video_download_command = ( + f".\\N_m3u8DL-RE.exe '{url}' --tmp-dir './{CACHE_FOLDER}' " + f"--save-dir '{save_dir}' --save-name '{save_name}' -M format=mp4 " + f"--check-segments-count false --download-retry-count 15 --thread-count 64" + ) + + if WINDOWS: + result = subprocess.run(['powershell', '-Command', video_download_command], text=True) + else: + result = subprocess.run(video_download_command, shell=True) + + return result + + +def download_segments_in_parallel(idm_flag, fallback_flag, CACHE_FOLDER, lesson_video_data, name_prefix): + has_error = False + + # MOOC TYPE + if fallback_flag == 2: + # Create a ThreadPoolExecutor to manage parallel downloads + with ThreadPoolExecutor(max_workers=6) as executor: + # Dictionary to hold future results + future_to_order = {} + + for order, url in enumerate(lesson_video_data): + if idm_flag: + future = executor.submit(download_segment_idm, CACHE_FOLDER, url, order, name_prefix) + else: + future = executor.submit(download_segment, CACHE_FOLDER, url, order, name_prefix) + + # Store the future and order for tracking + future_to_order[future] = order + + # Add a 1-second interval between submissions + time.sleep(1) + + # Iterate over the completed futures + for future in as_completed(future_to_order): + order = future_to_order[future] + try: + future.result() # Get the result (will raise exception if there was one) + print(f"Successfully downloaded {name_prefix} - {order}") + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix} - {order}", file=sys.stderr) + has_error = True + + # v1 type + elif fallback_flag == 1: + # Create a ThreadPoolExecutor to manage parallel downloads + with ThreadPoolExecutor(max_workers=6) as executor: + # Dictionary to hold future results + future_to_order = {} + + for order, segment in enumerate(lesson_video_data['data']['live_timeline']): + replay_url = segment['replay_url'] + + # Determine which function to use based on the presence of 'm3u8' in the replay_url + if 'm3u8' in replay_url: + future = executor.submit(download_segment_m3u8, idm_flag, CACHE_FOLDER, replay_url, order, + name_prefix, + max_retries=10) + else: + if idm_flag: + future = executor.submit(download_segment_idm, CACHE_FOLDER, replay_url, order, name_prefix) + else: + future = executor.submit(download_segment, CACHE_FOLDER, replay_url, order, name_prefix) + + # Store the future and order for tracking + future_to_order[future] = order + + # Add a 1-second interval between submissions + time.sleep(1) + + # Iterate over the completed futures + for future in as_completed(future_to_order): + order = future_to_order[future] + try: + future.result() # Get the result (will raise exception if there was one) + print(f"Successfully downloaded {name_prefix} - {order}") + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix} - {order}", file=sys.stderr) + has_error = True + + # v3 type + else: + # Create a ThreadPoolExecutor to manage parallel downloads + with ThreadPoolExecutor(max_workers=6) as executor: + # Dictionary to hold future results + future_to_order = {} + + for order, segment in enumerate(lesson_video_data['data']['live']): + url = segment['url'] + + # Determine which function to use based on the presence of 'm3u8' in the URL + if 'm3u8' in url: + future = executor.submit(download_segment_m3u8, idm_flag, CACHE_FOLDER, url, order, name_prefix, + max_retries=10) + else: + if idm_flag: + future = executor.submit(download_segment_idm, CACHE_FOLDER, url, order, name_prefix) + else: + future = executor.submit(download_segment, CACHE_FOLDER, url, order, name_prefix) + + # Store the future and order for tracking + future_to_order[future] = order + + # Add a 1-second interval between submissions + time.sleep(1) + + # Iterate over the completed futures + for future in as_completed(future_to_order): + order = future_to_order[future] + try: + future.result() # Get the result (will raise exception if there was one) + print(f"Successfully downloaded {name_prefix} - {order}") + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix} - {order}", file=sys.stderr) + has_error = True + + return has_error + + +def concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix, num_segments): + # Create the concat file with segment paths + with open(f"{CACHE_FOLDER}/concat.txt", "w", encoding='utf-8') as f: + for i in range(num_segments): + video_file_mp4 = f"../{CACHE_FOLDER}/{name_prefix}-{i}.mp4" + video_file_ts = f"../{CACHE_FOLDER}/{name_prefix}-{i}.ts" + if os.path.exists(os.path.join(CACHE_FOLDER, f"{name_prefix}-{i}.mp4")): # Check if the file exists + f.write(f"file '{video_file_mp4}'\n") + if os.path.exists(os.path.join(CACHE_FOLDER, f"{name_prefix}-{i}.ts")): # Check if the file exists + f.write(f"file '{video_file_ts}'\n") + + target_file = os.path.join(DOWNLOAD_FOLDER, f"{name_prefix}.mp4") + if os.path.exists(target_file): + print(f"Skipping '{DOWNLOAD_FOLDER}/{name_prefix}.mp4' - Video already present") + time.sleep(0.25) + return target_file + + # First video concatenation command using CUDA acceleration + video_concatenating_command = ( + f"{FFMPEG_PATH} -f concat -safe 0 -hwaccel cuda -hwaccel_output_format cuda " + f"-i '{CACHE_FOLDER}/concat.txt' " + f"-c:v av1_nvenc -cq 36 -g 200 -bf 7 -b_strategy 1 -sc_threshold 80 -me_range 16 " + f"-surfaces 64 -bufsize 12800k -refs 16 -r 7.5 -temporal-aq 1 -rc-lookahead 127 " + f"-c:a aac -ac 1 -rematrix_maxval 1.0 -b:a 64k '{DOWNLOAD_FOLDER}/{name_prefix}.mp4' -n " + f"-hide_banner -loglevel error -stats" + ) + + # Run the first command + if WINDOWS: + result = subprocess.run(['powershell', '-Command', video_concatenating_command], text=True) + else: + result = subprocess.run(video_concatenating_command, shell=True) + + # If the first command fails, try the fallback + if result.returncode != 0: + print(f"First attempt failed. Attempting fallback with software decoding.") + + # Fallback video concatenation command using cuvid acceleration + video_concatenating_command_fallback = ( + f"{FFMPEG_PATH} -f concat -safe 0 " + f"-i '{CACHE_FOLDER}/concat.txt' " + f"-c:v av1_nvenc -cq 36 -g 200 -bf 7 -b_strategy 1 -sc_threshold 80 -me_range 16 " + f"-surfaces 64 -bufsize 12800k -refs 16 -r 7.5 -temporal-aq 1 -rc-lookahead 127 " + f"-c:a copy '{DOWNLOAD_FOLDER}/{name_prefix}.mp4' -y " + f"-hide_banner -loglevel error -stats -err_detect ignore_err -fflags +discardcorrupt" + ) + + # Run the fallback command + if WINDOWS: + fallback_result = subprocess.run(['powershell', '-Command', video_concatenating_command_fallback], text=True) + else: + fallback_result = subprocess.run(video_concatenating_command_fallback, shell=True) + + # Check if the fallback also fails + if fallback_result.returncode != 0: + print(f"Both attempts failed to concatenate video segments.") + else: + print(f"Successfully concatenated video segments.") + else: + print(f"Successfully concatenated video segments using CUDA acceleration.") + + return result