diff --git a/main_windows.py b/main_windows.py index d3be313..0d9006c 100644 --- a/main_windows.py +++ b/main_windows.py @@ -141,6 +141,11 @@ folder_name = f"{course['name']}-{course['teacher']['name']}" folder_name = option.windows_filesame_sanitizer(folder_name) + + if idm_flag: + folder_name = folder_name.replace('/', '\\') + folder_name = re.sub(r'[“”]', '_', folder_name) + print('folder name would be:',folder_name) # Rename old folder @@ -165,7 +170,7 @@ if args.video: for index, lesson in enumerate(lesson_data['data']['activities']): - if not lesson['type'] in [14, 15]: + if not lesson['type'] in [14, 15, 17]: continue lesson['classroom_id'] = course['classroom_id'] @@ -176,8 +181,11 @@ print('Normal type detected!') download_lesson_video(lesson, name_prefix + str(length - index)) elif lesson['type'] == 15: - print('MOOC type detected!') + print('MOOCv2 type detected!') download_lesson_video_type15(lesson, name_prefix + str(length - index)) + elif lesson['type'] == 17: + print('MOOCv1 type detected!') + download_lesson_video_type17(lesson, name_prefix + str(length - index)) except Exception: print(traceback.format_exc()) print(f"Failed to download video for {name_prefix} - {lesson['title']}", file=sys.stderr) @@ -185,7 +193,7 @@ print('sbykt may not prepare cold data in one run, rescanning for missing ones') for index, lesson in enumerate(lesson_data['data']['activities']): - if not lesson['type'] in [14, 15]: + if not lesson['type'] in [14, 15, 17]: continue lesson['classroom_id'] = course['classroom_id'] @@ -196,15 +204,18 @@ print('Normal type detected!') download_lesson_video(lesson, name_prefix + str(length - index)) elif lesson['type'] == 15: - print('MOOC type detected!') + print('MOOCv2 type detected!') download_lesson_video_type15(lesson, name_prefix + str(length - index)) + elif lesson['type'] == 17: + print('MOOCv1 type detected!') + download_lesson_video_type17(lesson, name_prefix + str(length - index)) except Exception: print(traceback.format_exc()) print(f"Failed to download video for {name_prefix} - {lesson['title']}", file=sys.stderr) if args.ppt: for index, lesson in enumerate(lesson_data['data']['activities']): - if lesson['type'] == 15: + if lesson['type'] in (15, 17): print("mooc type has no ppts!") continue lesson['classroom_id'] = course['classroom_id'] @@ -219,7 +230,7 @@ print('sbykt may not prepare cold data in one run, rescanning for missing ones') for index, lesson in enumerate(lesson_data['data']['activities']): - if lesson['type'] == 15: + if lesson['type'] in (15, 17): print("mooc type has no ppts!") continue lesson['classroom_id'] = course['classroom_id'] @@ -312,6 +323,62 @@ for chapter in mooc_data['data']['content_info']: chapter_name = chapter['name'] + for orphan in chapter['leaf_list']: + orphan_title = orphan['title'] + orphan_id = orphan['id'] + has_error = False + + name_prefix_orphan = name_prefix + chapter_name + " - " + orphan_title + name_prefix_orphan = option.windows_filesame_sanitizer(name_prefix_orphan) + + if idm_flag: + name_prefix_orphan = re.sub(r'[“”]', '_', name_prefix_orphan) + + mooc_orphan_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/mooc-api/v1/lms/learn/leaf_info/{str(lesson['classroom_id'])}/{str(orphan_id)}/", + headers={ + "Xtbz": "ykt", + "Classroom-Id": str(lesson['classroom_id']) + } + ).json() + + if 'data' not in mooc_orphan_data or 'content_info' not in mooc_orphan_data['data']: + print('no media detected, skipping!') + continue + + mooc_orphan_media_id = mooc_orphan_data['data']['content_info']['media']['ccid'] + mooc_orphan_media_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/api/open/audiovideo/playurl?video_id={mooc_orphan_media_id}&provider=cc&is_single=0&format=json" + ).json() + + quality_keys = list(map(lambda x: (int(x[7:]), x), mooc_orphan_media_data['data']['playurl']['sources'].keys())) + quality_keys.sort(key=lambda x: x[0], reverse=True) + download_url_list = mooc_orphan_media_data['data']['playurl']['sources'][quality_keys[0][1]] + # print(download_url_list) + + # Download segments in parallel + try: + download_segments_in_parallel(idm_flag, 2, CACHE_FOLDER, download_url_list, name_prefix_orphan) + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix}", file=sys.stderr) + has_error = True + + # Start concatenation if downloads were successful + if not has_error: + time.sleep(0.25) + if 'playurl' in mooc_orphan_media_data['data'] and len(download_url_list) > 0: + print(f"Concatenating {name_prefix}") + concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix_orphan, len(download_url_list)) + else: + print('concatenate cannot start due to previous failure') + else: + print('concatenate cannot start due to previous failure') + + if has_error: + with open(f"{DOWNLOAD_FOLDER}/error.log", "a") as f: + f.write(f"{name_prefix}\n") + for section in chapter['section_list']: section_name = section['name'] @@ -334,6 +401,10 @@ } ).json() + if 'data' not in mooc_lesson_data or 'content_info' not in mooc_lesson_data['data']: + print('no media detected, skipping!') + continue + mooc_media_id = mooc_lesson_data['data']['content_info']['media']['ccid'] mooc_media_data = rainclassroom_sess.get( @@ -369,6 +440,76 @@ f.write(f"{name_prefix}\n") +def download_lesson_video_type17(lesson: dict, name_prefix: str = ""): + mooc_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/c27/online_courseware/xty/kls/pub_news/{lesson['courseware_id']}/", + headers={ + "Xtbz": "ykt", + "Classroom-Id": str(lesson['classroom_id']) + } + ).json() + + if 'name' not in mooc_data['data']['content_info'] or 'content_info' not in mooc_data['data']: + print('no media detected, skipping!') + return + + only_lesson_name = mooc_data['data']['content_info']['name'] + only_lesson_id = mooc_data['data']['content_info']['id'] + + has_error = False + + name_prefix_lesson = name_prefix + only_lesson_name + name_prefix_lesson = option.windows_filesame_sanitizer(name_prefix_lesson) + + if idm_flag: + name_prefix_lesson = re.sub(r'[“”]', '_', name_prefix_lesson) + + mooc_lesson_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/mooc-api/v1/lms/learn/leaf_info/{str(lesson['classroom_id'])}/{str(only_lesson_id)}/", + headers={ + "Xtbz": "ykt", + "Classroom-Id": str(lesson['classroom_id']) + } + ).json() + + if 'data' not in mooc_lesson_data or 'content_info' not in mooc_lesson_data['data']: + print('no media detected, skipping!') + return + + mooc_media_id = mooc_lesson_data['data']['content_info']['media']['ccid'] + + mooc_media_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/api/open/audiovideo/playurl?video_id={mooc_media_id}&provider=cc&is_single=0&format=json" + ).json() + + quality_keys = list(map(lambda x: (int(x[7:]), x), mooc_media_data['data']['playurl']['sources'].keys())) + quality_keys.sort(key=lambda x: x[0], reverse=True) + download_url_list = mooc_media_data['data']['playurl']['sources'][quality_keys[0][1]] + # print(download_url_list) + + # Download segments in parallel + try: + download_segments_in_parallel(idm_flag, 2, CACHE_FOLDER, download_url_list, name_prefix_lesson) + except Exception: + print(traceback.format_exc()) + print(f"Failed to download {name_prefix}", file=sys.stderr) + has_error = True + + # Start concatenation if downloads were successful + if not has_error: + time.sleep(1) + if 'playurl' in mooc_media_data['data'] and len(download_url_list) > 0: + print(f"Concatenating {name_prefix}") + concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix_lesson, len(download_url_list)) + else: + print('concatenate cannot start due to previous failure') + else: + print('concatenate cannot start due to previous failure') + + if has_error: + with open(f"{DOWNLOAD_FOLDER}/error.log", "a") as f: + f.write(f"{name_prefix}\n") + from ppt_processing import download_ppt diff --git a/ppt_processing.py b/ppt_processing.py index 7ec4375..a7580bf 100644 --- a/ppt_processing.py +++ b/ppt_processing.py @@ -2,6 +2,7 @@ import time import subprocess import re +import option def download_ppt(version, arg_ans, arg_pdf, CACHE_FOLDER, DOWNLOAD_FOLDER, ppt_raw_data, name_prefix: str = ""): @@ -12,15 +13,7 @@ else: name_prefix += "-" + ppt_raw_data['data']['presentation']['title'].rstrip() - # Remove illegal characters for Windows filenames - name_prefix = re.sub(r'[<>:"\\|?*\x00-\x1F]', '_', name_prefix) - name_prefix = re.sub(r'[\x80-\xFF]', '', name_prefix) - # Step 2: Preserve the first `/` and replace the rest with underscores - parts = name_prefix.split("/", 1) # Split into two parts at the first slash - if len(parts) > 1: - name_prefix = parts[0] + "/" + parts[1].replace("/", "_") # Preserve first slash, replace others - else: - name_prefix = parts[0] # No slashes found + name_prefix = option.windows_filesame_sanitizer(name_prefix) # If PDF is present, skip if os.path.exists(f"{DOWNLOAD_FOLDER}/{name_prefix}.pdf"): diff --git a/video_processing.py b/video_processing.py index bbb4f79..18c5157 100644 --- a/video_processing.py +++ b/video_processing.py @@ -246,7 +246,8 @@ video_concatenating_command = ( f"ffmpeg -f concat -safe 0 -hwaccel cuda -hwaccel_output_format cuda " f"-i '{CACHE_FOLDER}/concat.txt' " - f"-c:v av1_nvenc -cq 28 -surfaces 64 -bufsize 12800k -r 7.5 -rc-lookahead 63 " + f"-c:v av1_nvenc -cq 36 -g 200 -bf 7 -b_strategy 1 -sc_threshold 80 -me_range 16 " + f"-surfaces 64 -bufsize 12800k -refs 16 -r 7.5 -temporal-aq 1 -rc-lookahead 127 " f"-c:a aac -ac 1 -rematrix_maxval 1.0 -b:a 64k '{DOWNLOAD_FOLDER}/{name_prefix}.mp4' -n " f"-hide_banner -loglevel error -stats" ) @@ -262,7 +263,8 @@ video_concatenating_command_fallback = ( f"ffmpeg -f concat -safe 0 " f"-i '{CACHE_FOLDER}/concat.txt' " - f"-c:v av1_nvenc -cq 28 -surfaces 64 -bufsize 12800k -r 7.5 -rc-lookahead 63 " + f"-c:v av1_nvenc -cq 36 -g 200 -bf 7 -b_strategy 1 -sc_threshold 80 -me_range 16 " + f"-surfaces 64 -bufsize 12800k -refs 16 -r 7.5 -temporal-aq 1 -rc-lookahead 127 " f"-c:a copy '{DOWNLOAD_FOLDER}/{name_prefix}.mp4' -y " f"-hide_banner -loglevel error -stats -err_detect ignore_err -fflags +discardcorrupt" )