diff --git a/main_windows.py b/main_windows.py index 4fbcf10..d3be313 100644 --- a/main_windows.py +++ b/main_windows.py @@ -3,12 +3,18 @@ import argparse import time import re +import traceback +import option parser = argparse.ArgumentParser(add_help=False) parser.add_argument("-h", "--help", action="store_true", help="Show this help message and exit") parser.add_argument("-c", "--session-cookie", help="Session Cookie", required=False) parser.add_argument("-y", "--ykt-host", help="RainClassroom Host", required=False, default="pro.yuketang.cn") +parser.add_argument("-i", "--idm", action="store_true", help="Use IDMan.exe") +parser.add_argument("-ni", "--no-idm", action="store_true", help="Don't use IDMan.exe") +parser.add_argument("-a", "--all", action="store_true", help="All in") +parser.add_argument("-na", "--no-all", action="store_true", help="No All in") parser.add_argument("--video", action="store_true", help="Download Video") parser.add_argument("--ppt", action="store_true", help="Download PPT") parser.add_argument("--ppt-to-pdf", action="store_true", help="Convert PPT to PDF", default=True) @@ -131,11 +137,11 @@ def get_lesson_list(course: dict, name_prefix: str = ""): lesson_data = rainclassroom_sess.get( - f"https://{YKT_HOST}/v2/api/web/logs/learn/{course['classroom_id']}?actype=14&page=0&offset=500&sort=-1").json() + f"https://{YKT_HOST}/v2/api/web/logs/learn/{course['classroom_id']}?actype=-1&page=0&offset=500&sort=-1").json() folder_name = f"{course['name']}-{course['teacher']['name']}" - folder_name = re.sub(r'[<>:"\\|?*\x00-\x1F]', '_', folder_name) - folder_name = re.sub(r'[\x80-\xFF]', '', folder_name) + folder_name = option.windows_filesame_sanitizer(folder_name) + print('folder name would be:',folder_name) # Rename old folder if os.path.exists(f"{DOWNLOAD_FOLDER}/{course['name']}"): @@ -147,10 +153,9 @@ os.makedirs(f"{DOWNLOAD_FOLDER}/{folder_name}", exist_ok=True) os.makedirs(f"{CACHE_FOLDER}/{folder_name}", exist_ok=True) + name_prefix += folder_name.rstrip() + "/" - # Remove illegal characters for Windows filenames - name_prefix = re.sub(r'[<>:"\\|?*\x00-\x1F]', '_', name_prefix) - name_prefix = re.sub(r'[\x80-\xFF]', '', name_prefix) + name_prefix = option.windows_filesame_sanitizer(name_prefix) if args.lesson_name_filter is not None: lesson_data['data']['activities'] = [l for l in lesson_data['data']['activities'] if @@ -160,48 +165,70 @@ if args.video: for index, lesson in enumerate(lesson_data['data']['activities']): + if not lesson['type'] in [14, 15]: + continue + lesson['classroom_id'] = course['classroom_id'] # Lesson try: - download_lesson_video(lesson, name_prefix + str(length - index)) - except Exception as e: - print(e) + if lesson['type'] == 14: + print('Normal type detected!') + download_lesson_video(lesson, name_prefix + str(length - index)) + elif lesson['type'] == 15: + print('MOOC type detected!') + download_lesson_video_type15(lesson, name_prefix + str(length - index)) + except Exception: + print(traceback.format_exc()) print(f"Failed to download video for {name_prefix} - {lesson['title']}", file=sys.stderr) print('sbykt may not prepare cold data in one run, rescanning for missing ones') for index, lesson in enumerate(lesson_data['data']['activities']): + if not lesson['type'] in [14, 15]: + continue + lesson['classroom_id'] = course['classroom_id'] # Lesson try: - download_lesson_video(lesson, name_prefix + str(length - index)) - except Exception as e: - print(e) + if lesson['type'] == 14: + print('Normal type detected!') + download_lesson_video(lesson, name_prefix + str(length - index)) + elif lesson['type'] == 15: + print('MOOC type detected!') + download_lesson_video_type15(lesson, name_prefix + str(length - index)) + except Exception: + print(traceback.format_exc()) print(f"Failed to download video for {name_prefix} - {lesson['title']}", file=sys.stderr) if args.ppt: for index, lesson in enumerate(lesson_data['data']['activities']): + if lesson['type'] == 15: + print("mooc type has no ppts!") + continue lesson['classroom_id'] = course['classroom_id'] # Lesson try: download_lesson_ppt(lesson, name_prefix + str(length - index)) - except Exception as e: - print(e) + except Exception: + print(traceback.format_exc()) print(f"Failed to download PPT for {name_prefix} - {lesson['title']}", file=sys.stderr) print('sbykt may not prepare cold data in one run, rescanning for missing ones') for index, lesson in enumerate(lesson_data['data']['activities']): + if lesson['type'] == 15: + print("mooc type has no ppts!") + continue lesson['classroom_id'] = course['classroom_id'] # Lesson try: download_lesson_ppt(lesson, name_prefix + str(length - index)) - except Exception as e: - print(e) + except Exception: + print(traceback.format_exc()) print(f"Failed to download PPT for {name_prefix} - {lesson['title']}", file=sys.stderr) @@ -215,15 +242,10 @@ f"https://{YKT_HOST}/api/v3/lesson-summary/replay?lesson_id={lesson['courseware_id']}").json() name_prefix += "-" + lesson['title'].rstrip() - # Remove illegal characters for Windows filenames - name_prefix = re.sub(r'[<>:"\\|?*\x00-\x1F]', '_', name_prefix) - name_prefix = re.sub(r'[\x80-\xFF]', '', name_prefix) - # Step 2: Preserve the first `/` and replace the rest with underscores - parts = name_prefix.split("/", 1) # Split into two parts at the first slash - if len(parts) > 1: - name_prefix = parts[0] + "/" + parts[1].replace("/", "_") # Preserve first slash, replace others - else: - name_prefix = parts[0] # No slashes found + name_prefix = option.windows_filesame_sanitizer(name_prefix) + + if idm_flag: + name_prefix = re.sub(r'[“”]', '_', name_prefix) if 'live' not in lesson_video_data['data']: print(f"v3 protocol detection failed, falling back to v1") @@ -252,9 +274,9 @@ # Download segments in parallel try: - download_segments_in_parallel(fallback_flag, CACHE_FOLDER, lesson_video_data, name_prefix) - except Exception as e: - print(e) + download_segments_in_parallel(idm_flag, fallback_flag, CACHE_FOLDER, lesson_video_data, name_prefix) + except Exception: + print(traceback.format_exc()) print(f"Failed to download {name_prefix}", file=sys.stderr) has_error = True @@ -278,6 +300,76 @@ f.write(f"{name_prefix}\n") +def download_lesson_video_type15(lesson: dict, name_prefix: str = ""): + mooc_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/c27/online_courseware/xty/kls/pub_news/{lesson['courseware_id']}/", + headers={ + "Xtbz": "ykt", + "Classroom-Id": str(lesson['classroom_id']) + } + ).json() + + for chapter in mooc_data['data']['content_info']: + chapter_name = chapter['name'] + + for section in chapter['section_list']: + section_name = section['name'] + + for lesson_d in section['leaf_list']: + lesson_name = lesson_d['title'] + lesson_id = lesson_d['id'] + has_error = False + + name_prefix_lesson = name_prefix + chapter_name + " - " + section_name + " - " + lesson_name + name_prefix_lesson = option.windows_filesame_sanitizer(name_prefix_lesson) + + if idm_flag: + name_prefix_lesson = re.sub(r'[“”]', '_', name_prefix_lesson) + + mooc_lesson_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/mooc-api/v1/lms/learn/leaf_info/{str(lesson['classroom_id'])}/{str(lesson_id)}/", + headers={ + "Xtbz": "ykt", + "Classroom-Id": str(lesson['classroom_id']) + } + ).json() + + mooc_media_id = mooc_lesson_data['data']['content_info']['media']['ccid'] + + mooc_media_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/api/open/audiovideo/playurl?video_id={mooc_media_id}&provider=cc&is_single=0&format=json" + ).json() + + quality_keys = list(map(lambda x: (int(x[7:]), x), mooc_media_data['data']['playurl']['sources'].keys())) + quality_keys.sort(key=lambda x: x[0], reverse=True) + download_url_list = mooc_media_data['data']['playurl']['sources'][quality_keys[0][1]] + # print(download_url_list) + + # Download segments in parallel + try: + download_segments_in_parallel(idm_flag, 2, CACHE_FOLDER, download_url_list, name_prefix_lesson) + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix}", file=sys.stderr) + has_error = True + + # Start concatenation if downloads were successful + if not has_error: + time.sleep(1) + if 'playurl' in mooc_media_data['data'] and len(download_url_list) > 0: + print(f"Concatenating {name_prefix}") + concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix_lesson, len(download_url_list)) + else: + print('concatenate cannot start due to previous failure') + else: + print('concatenate cannot start due to previous failure') + + if has_error: + with open(f"{DOWNLOAD_FOLDER}/error.log", "a") as f: + f.write(f"{name_prefix}\n") + + + from ppt_processing import download_ppt @@ -286,9 +378,7 @@ f"https://{YKT_HOST}/api/v3/lesson-summary/student?lesson_id={lesson['courseware_id']}").json() name_prefix += "-" + lesson['title'].rstrip() - # Remove illegal characters for Windows filenames - name_prefix = re.sub(r'[<>:"\\|?*\x00-\x1F]', '_', name_prefix) - name_prefix = re.sub(r'[\x80-\xFF]', '', name_prefix) + name_prefix = option.windows_filesame_sanitizer(name_prefix) if 'presentations' not in lesson_data['data']: print(f"v3 protocol detection failed, falling back to v1") @@ -308,7 +398,7 @@ ppt_raw_data, name_prefix + f"-{index}") except Exception as e: - print(e) + print(traceback.format_exc()) print(f"Failed to download PPT {name_prefix} - {ppt['title']}", file=sys.stderr) else: @@ -321,7 +411,7 @@ ppt_raw_data, name_prefix + f"-{index}") except Exception as e: - print(e) + print(traceback.format_exc()) print(f"Failed to download PPT {name_prefix} - {ppt['title']}", file=sys.stderr) @@ -331,7 +421,25 @@ import option as opt print('successfully parsed account info!') -allin_flag = opt.ask_for_allin() + +if args.all and args.no_all: + print("'-a' and '-na' cannot be used together") +if args.idm and args.no_idm: + print("'-idm' and '-no_idm' cannot be used together") + +if args.all: + allin_flag = 1 +elif args.no_all: + allin_flag = 0 +else: + allin_flag = opt.ask_for_allin() + +if args.idm: + idm_flag = 1 +elif args.no_idm: + idm_flag = 0 +else: + idm_flag = opt.ask_for_idm() for course in courses: skip_flag = 0 @@ -346,5 +454,5 @@ else: get_lesson_list(course) except Exception as e: - print(e) + print(traceback.format_exc()) print(f"Failed to parse {course['name']}", file=sys.stderr) diff --git a/option.py b/option.py index 9ee42d7..f8b6517 100644 --- a/option.py +++ b/option.py @@ -1,3 +1,4 @@ +import re import sys @@ -32,4 +33,35 @@ print("Cancelled 'All in' operation.") return 0 else: - print("Invalid input, please enter 'y' or 'n'.") \ No newline at end of file + print("Invalid input, please enter 'y' or 'n'.") + + +def ask_for_idm(): + while True: + print('asking for whether to download with IDM...') + confirmation = input( + "IDM is a fast parallel downloader.\n" + "You need to install IDM and add idman.exe to SYSTEM PATH!!!\n" + "Without installing IDM the script won't run!!!!!!!!\n" + " Are you sure? (y/n): ").lower() + if confirmation == 'y': + print("Choosing IDM as download method") + print("Enjoy fast downloading") + return 1 # Set idm_flag to 1 + elif confirmation == 'n': + print("Choosing default download method") + return 0 + else: + print("Invalid input, please enter 'y' or 'n'.") + +def windows_filesame_sanitizer(input_str): + # Remove illegal characters for Windows filenames + input_str = re.sub(r'[<>:"\\|?*\x00-\x1F]', '_', input_str) + input_str = re.sub(r'[\x80-\xFF]', '', input_str) + # Step 2: Preserve the first `/` and replace the rest with underscores + parts = input_str.split("/", 1) # Split into two parts at the first slash + if len(parts) > 1: + input_str = parts[0] + "/" + parts[1].replace("/", "_") # Preserve first slash, replace others + else: + input_str = parts[0] # No slashes found + return input_str \ No newline at end of file diff --git a/video_processing.py b/video_processing.py index aa57297..bbb4f79 100644 --- a/video_processing.py +++ b/video_processing.py @@ -1,7 +1,10 @@ import os +import re +import subprocess +import sys import time +import traceback from concurrent.futures import ThreadPoolExecutor, as_completed -import subprocess, sys def download_segment(CACHE_FOLDER, url: str, order: int, name_prefix: str = ""): @@ -9,12 +12,63 @@ video_download_command = (f".\\aria2c -o '{CACHE_FOLDER}/{name_prefix}-{order}.mp4'" f" -x 16 -s 16 '{url}' -c -l aria2c_video.log --log-level warn") + result = subprocess.run(['powershell', '-Command', video_download_command], text=True) return result -def download_segment_m3u8(CACHE_FOLDER, url: str, order: int, name_prefix: str = "", max_retries: int = 35): +def download_segment_idm(CACHE_FOLDER, url: str, order: int, name_prefix: str = ""): + print(f"Downloading {name_prefix} - {order}") + + name_prefix_idm = name_prefix.replace('/', '\\') + + # Define the full path for the downloaded file + downloaded_file = os.path.join(CACHE_FOLDER, f"{name_prefix_idm}-{order}.mp4") + print("Downloading to", downloaded_file) + + if os.path.exists(downloaded_file): + print(f"Skipping {downloaded_file} - Video already present") + time.sleep(0.25) + return downloaded_file + + # Construct the IDM download command + video_download_command = ( + f"idman /n /d \"{url}\" /p \"$(pwd)\" /f \"{downloaded_file}\"" + ) + + # Start the IDM download process + subprocess.run(['powershell', '-Command', video_download_command], text=True) + + # Wait for the file to appear and its download to complete + print(f"Waiting for {downloaded_file} to finish downloading...") + try: + # Wait until the file is created + while not os.path.exists(downloaded_file): + time.sleep(1) # Wait until the file is created + + # Wait until the file stops growing + prev_size = -1 + while True: + curr_size = os.path.getsize(downloaded_file) + if curr_size == prev_size: + break # File size hasn't changed, download likely complete + prev_size = curr_size + time.sleep(1) # Check every second + + except KeyboardInterrupt: + print("\nDownload interrupted by user.") + # Optionally, clean up the partially downloaded file + if os.path.exists(downloaded_file): + print(f"Removing incomplete file: {downloaded_file}") + os.remove(downloaded_file) + raise # Re-raise the exception to propagate it + + print(f"Download completed: {downloaded_file}") + return downloaded_file + + +def download_segment_m3u8(idm_flag, CACHE_FOLDER, url: str, order: int, name_prefix: str = "", max_retries: int = 35): print(f"Downloading {name_prefix} - {order}") print(f"Downloading from {url}") @@ -38,27 +92,65 @@ save_name = os.path.basename(output_path) if 'mp3' in url: - video_download_command = ( - f".\\ffmpeg -i '{url}' -c:v copy -c:a copy -n '{CACHE_FOLDER}/{name_prefix}-{order}.mp4' " - f"-hide_banner -loglevel error -stats" - ) + if idm_flag: + video_download_command = ( + f"idman /n /d \"{url}\" /p \"$(pwd)\" /f '{CACHE_FOLDER}/{name_prefix}-{order}.mp4'" + ) + else: + video_download_command = ( + f".\\ffmpeg -i '{url}' -c:v copy -c:a copy -n '{CACHE_FOLDER}/{name_prefix}-{order}.mp4' " + f"-hide_banner -loglevel error -stats" + ) + else: video_download_command = ( - f".\\N_m3u8DL-RE.exe '{url}' --tmp-dir './{CACHE_FOLDER}' " - f"--save-dir '{save_dir}' --save-name '{save_name}' -M format=mp4 " - f"--check-segments-count false --download-retry-count 15 --thread-count 64" + f".\\N_m3u8DL-RE.exe '{url}' --tmp-dir './{CACHE_FOLDER}' " + f"--save-dir '{save_dir}' --save-name '{save_name}' -M format=mp4 " + f"--check-segments-count false --download-retry-count 15 --thread-count 64" ) result = subprocess.run(['powershell', '-Command', video_download_command], text=True) return result -def download_segments_in_parallel(fallback_flag, CACHE_FOLDER, lesson_video_data, name_prefix): + +def download_segments_in_parallel(idm_flag, fallback_flag, CACHE_FOLDER, lesson_video_data, name_prefix): has_error = False - if fallback_flag: + # MOOC TYPE + if fallback_flag == 2: # Create a ThreadPoolExecutor to manage parallel downloads - with ThreadPoolExecutor(max_workers=3) as executor: + with ThreadPoolExecutor(max_workers=6) as executor: + # Dictionary to hold future results + future_to_order = {} + + for order, url in enumerate(lesson_video_data): + if idm_flag: + future = executor.submit(download_segment_idm, CACHE_FOLDER, url, order, name_prefix) + else: + future = executor.submit(download_segment, CACHE_FOLDER, url, order, name_prefix) + + # Store the future and order for tracking + future_to_order[future] = order + + # Add a 1-second interval between submissions + time.sleep(1) + + # Iterate over the completed futures + for future in as_completed(future_to_order): + order = future_to_order[future] + try: + future.result() # Get the result (will raise exception if there was one) + print(f"Successfully downloaded {name_prefix} - {order}") + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix} - {order}", file=sys.stderr) + has_error = True + + # v1 type + elif fallback_flag == 1: + # Create a ThreadPoolExecutor to manage parallel downloads + with ThreadPoolExecutor(max_workers=6) as executor: # Dictionary to hold future results future_to_order = {} @@ -67,28 +159,36 @@ # Determine which function to use based on the presence of 'm3u8' in the replay_url if 'm3u8' in replay_url: - future = executor.submit(download_segment_m3u8, CACHE_FOLDER, replay_url, order, name_prefix, + future = executor.submit(download_segment_m3u8, idm_flag, CACHE_FOLDER, replay_url, order, + name_prefix, max_retries=10) else: - future = executor.submit(download_segment, CACHE_FOLDER, replay_url, order, name_prefix) + if idm_flag: + future = executor.submit(download_segment_idm, CACHE_FOLDER, replay_url, order, name_prefix) + else: + future = executor.submit(download_segment, CACHE_FOLDER, replay_url, order, name_prefix) # Store the future and order for tracking future_to_order[future] = order + # Add a 1-second interval between submissions + time.sleep(1) + # Iterate over the completed futures for future in as_completed(future_to_order): order = future_to_order[future] try: future.result() # Get the result (will raise exception if there was one) print(f"Successfully downloaded {name_prefix} - {order}") - except Exception as e: - print(e) + except Exception: + print(traceback.format_exc()) print(f"Failed to download {name_prefix} - {order}", file=sys.stderr) has_error = True + # v3 type else: # Create a ThreadPoolExecutor to manage parallel downloads - with ThreadPoolExecutor(max_workers=3) as executor: + with ThreadPoolExecutor(max_workers=6) as executor: # Dictionary to hold future results future_to_order = {} @@ -97,22 +197,28 @@ # Determine which function to use based on the presence of 'm3u8' in the URL if 'm3u8' in url: - future = executor.submit(download_segment_m3u8, CACHE_FOLDER, url, order, name_prefix, + future = executor.submit(download_segment_m3u8, idm_flag, CACHE_FOLDER, url, order, name_prefix, max_retries=10) else: - future = executor.submit(download_segment, CACHE_FOLDER, url, order, name_prefix) + if idm_flag: + future = executor.submit(download_segment_idm, CACHE_FOLDER, url, order, name_prefix) + else: + future = executor.submit(download_segment, CACHE_FOLDER, url, order, name_prefix) # Store the future and order for tracking future_to_order[future] = order + # Add a 1-second interval between submissions + time.sleep(1) + # Iterate over the completed futures for future in as_completed(future_to_order): order = future_to_order[future] try: future.result() # Get the result (will raise exception if there was one) print(f"Successfully downloaded {name_prefix} - {order}") - except Exception as e: - print(e) + except Exception: + print(traceback.format_exc()) print(f"Failed to download {name_prefix} - {order}", file=sys.stderr) has_error = True @@ -120,7 +226,6 @@ def concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix, num_segments): - # Create the concat file with segment paths with open(f"{CACHE_FOLDER}/concat.txt", "w", encoding='utf-8') as f: for i in range(num_segments): @@ -131,11 +236,17 @@ if os.path.exists(os.path.join(CACHE_FOLDER, f"{name_prefix}-{i}.ts")): # Check if the file exists f.write(f"file '{video_file_ts}'\n") + target_file = os.path.join(DOWNLOAD_FOLDER, f"{name_prefix}.mp4") + if os.path.exists(target_file): + print(f"Skipping '{DOWNLOAD_FOLDER}/{name_prefix}.mp4' - Video already present") + time.sleep(0.25) + return target_file + # First video concatenation command using CUDA acceleration video_concatenating_command = ( f"ffmpeg -f concat -safe 0 -hwaccel cuda -hwaccel_output_format cuda " f"-i '{CACHE_FOLDER}/concat.txt' " - f"-c:v hevc_nvenc -cq 28 -surfaces 64 -bufsize 12800k -r 7.5 -rc-lookahead 63 " + f"-c:v av1_nvenc -cq 28 -surfaces 64 -bufsize 12800k -r 7.5 -rc-lookahead 63 " f"-c:a aac -ac 1 -rematrix_maxval 1.0 -b:a 64k '{DOWNLOAD_FOLDER}/{name_prefix}.mp4' -n " f"-hide_banner -loglevel error -stats" ) @@ -151,7 +262,7 @@ video_concatenating_command_fallback = ( f"ffmpeg -f concat -safe 0 " f"-i '{CACHE_FOLDER}/concat.txt' " - f"-c:v hevc_nvenc -cq 28 -surfaces 64 -bufsize 12800k -r 7.5 -rc-lookahead 63 " + f"-c:v av1_nvenc -cq 28 -surfaces 64 -bufsize 12800k -r 7.5 -rc-lookahead 63 " f"-c:a copy '{DOWNLOAD_FOLDER}/{name_prefix}.mp4' -y " f"-hide_banner -loglevel error -stats -err_detect ignore_err -fflags +discardcorrupt" )