diff --git a/aria2c.exe b/aria2c.exe new file mode 100644 index 0000000..5004e10 --- /dev/null +++ b/aria2c.exe Binary files differ diff --git a/main_windows.py b/main_windows.py index 53a0c16..28e9fc2 100644 --- a/main_windows.py +++ b/main_windows.py @@ -157,6 +157,8 @@ if args.video: for index, lesson in enumerate(lesson_data['data']['activities']): + lesson['classroom_id'] = course['classroom_id'] + # Lesson try: download_lesson_video(lesson, name_prefix + str(length - index)) @@ -166,6 +168,8 @@ if args.ppt: for index, lesson in enumerate(lesson_data['data']['activities']): + lesson['classroom_id'] = course['classroom_id'] + # Lesson try: download_lesson_ppt(lesson, name_prefix + str(length - index)) @@ -183,33 +187,52 @@ lesson_video_data = rainclassroom_sess.get( f"https://{YKT_HOST}/api/v3/lesson-summary/replay?lesson_id={lesson['courseware_id']}").json() + name_prefix += "-" + lesson['title'].rstrip() # Remove illegal characters for Windows filenames name_prefix = re.sub(r'[<>:"\\|?*]', '_', name_prefix) if 'live' not in lesson_video_data['data']: - print(f"Skipping {name_prefix} - No Video", file=sys.stderr) - return + print(f"v3 protocol detection failed, falling back to v1") + + fallback_flag = 1 + + lesson_video_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/v/lesson/get_lesson_replay_timeline/?lesson_id={lesson['courseware_id']}").json() + + if 'live_timeline' not in lesson_video_data['data']: + print(f"Skipping {name_prefix} - No Video", file=sys.stderr) + return + else: + fallback_flag = 0 if os.path.exists(f"{DOWNLOAD_FOLDER}/{name_prefix}.mp4"): print(f"Skipping {name_prefix} - Video already present") - time.sleep(0.5) + time.sleep(0.25) return has_error = False # Download segments in parallel try: - download_segments_in_parallel(CACHE_FOLDER, lesson_video_data, name_prefix) + download_segments_in_parallel(fallback_flag, CACHE_FOLDER, lesson_video_data, name_prefix) except Exception as e: print(e) print(f"Failed to download {name_prefix}", file=sys.stderr) has_error = True # Start concatenation if downloads were successful - if not has_error and len(lesson_video_data['data']['live']) > 0: - print(f"Concatenating {name_prefix}") - concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix, len(lesson_video_data['data']['live'])) + if not has_error: + if 'live' in lesson_video_data['data'] and len(lesson_video_data['data']['live']) > 0: + print(f"Concatenating {name_prefix}") + concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix, len(lesson_video_data['data']['live'])) + elif 'live_timeline' in lesson_video_data['data'] and len(lesson_video_data['data']['live_timeline']) > 0: + print(f"Concatenating {name_prefix}") + concatenate_segments(CACHE_FOLDER, DOWNLOAD_FOLDER, name_prefix, len(lesson_video_data['data']['live_timeline'])) + else: + print('concatenate cannot start due to previous failure') + else: + print('concatenate cannot start due to previous failure') if has_error: with open(f"{DOWNLOAD_FOLDER}/error.log", "a") as f: @@ -228,18 +251,38 @@ name_prefix = re.sub(r'[<>:"\\|?*]', '_', name_prefix) if 'presentations' not in lesson_data['data']: - print(f"Skipping {name_prefix} - No PPT", file=sys.stderr) - return + print(f"v3 protocol detection failed, falling back to v1") - for index, ppt in enumerate(lesson_data['data']['presentations']): - # PPT - try: - ppt_raw_data = rainclassroom_sess.get( - f"https://{YKT_HOST}/api/v3/lesson-summary/student/presentation?presentation_id={ppt['id']}&lesson_id={lesson["courseware_id"]}").json() - download_ppt(args.ppt_problem_answer, args.ppt_to_pdf, CACHE_FOLDER, DOWNLOAD_FOLDER, YKT_HOST, ppt_raw_data, name_prefix + f"-{index}") - except Exception as e: - print(e) - print(f"Failed to download PPT {name_prefix} - {ppt['title']}", file=sys.stderr) + ppt_info = rainclassroom_sess.get( + f"https://{YKT_HOST}/v2/api/web/lessonafter/{lesson['courseware_id']}/presentation?classroom_id={lesson['classroom_id']}").json() + if 'id' not in ppt_info['data'][0]: + print(f"Skipping {name_prefix} - No PPT", file=sys.stderr) + return + + for index, ppt in enumerate(ppt_info['data']): + # PPT + try: + ppt_raw_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/v2/api/web/lessonafter/presentation/{ppt['id']}?classroom_id={lesson['classroom_id']}").json() + download_ppt(1, args.ppt_problem_answer, args.ppt_to_pdf, CACHE_FOLDER, DOWNLOAD_FOLDER, + ppt_raw_data, name_prefix + f"-{index}") + + except Exception as e: + print(e) + print(f"Failed to download PPT {name_prefix} - {ppt['title']}", file=sys.stderr) + + else: + for index, ppt in enumerate(lesson_data['data']['presentations']): + # PPT + try: + ppt_raw_data = rainclassroom_sess.get( + f"https://{YKT_HOST}/api/v3/lesson-summary/student/presentation?presentation_id={ppt['id']}&lesson_id={lesson["courseware_id"]}").json() + download_ppt(3,args.ppt_problem_answer, args.ppt_to_pdf, CACHE_FOLDER, DOWNLOAD_FOLDER, + ppt_raw_data, name_prefix + f"-{index}") + + except Exception as e: + print(e) + print(f"Failed to download PPT {name_prefix} - {ppt['title']}", file=sys.stderr) # --- --- --- Section Main --- --- --- # diff --git a/ppt_processing.py b/ppt_processing.py index 2486310..747bbe4 100644 --- a/ppt_processing.py +++ b/ppt_processing.py @@ -3,52 +3,54 @@ import subprocess import re -CACHE_FOLDER = 'cache' -DOWNLOAD_FOLDER = 'data' -def download_ppt(arg_ans, arg_pdf, CACHE_FOLDER, DOWNLOAD_FOLDER, YKT_HOST, ppt_raw_data, name_prefix: str = ""): - +def download_ppt(version, arg_ans, arg_pdf, CACHE_FOLDER, DOWNLOAD_FOLDER, ppt_raw_data, name_prefix: str = ""): print(f"Downloading {name_prefix}") - # ppt_raw_data = rainclassroom_sess.get( - # f"https://{YKT_HOST}/api/v3/lesson-summary/student/presentation?presentation_id={ppt_id}&lesson_id={lesson_id}").json() - name_prefix += "-" + ppt_raw_data['data']['presentation']['title'].rstrip() + if version == 1: + name_prefix += "-" + ppt_raw_data['data']['title'].rstrip() + else: + name_prefix += "-" + ppt_raw_data['data']['presentation']['title'].rstrip() + # Remove illegal characters for Windows filenames name_prefix = re.sub(r'[<>:"\\|?*]', '_', name_prefix) # If PDF is present, skip if os.path.exists(f"{DOWNLOAD_FOLDER}/{name_prefix}.pdf"): print(f"Skipping {name_prefix} - PDF already present") - time.sleep(0.5) + time.sleep(0.25) return os.makedirs(f"{DOWNLOAD_FOLDER}/{name_prefix}", exist_ok=True) images = [] - with open(f"{CACHE_FOLDER}/ppt_download.txt", "w", encoding='utf-8') as f: - for slide in ppt_raw_data['data']['slides']: - if not slide.get('cover'): - continue + if version == 1: + with open(f"{CACHE_FOLDER}/ppt_download.txt", "w", encoding='utf-8') as f: + for slide in ppt_raw_data['data']['slides']: + if not slide.get('Cover'): + continue - f.write(f"{slide['cover']}\n out={DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg\n") - images.append(f"{DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg") + f.write(f"{slide['Cover']}\n out={DOWNLOAD_FOLDER}/{name_prefix}/{slide['Index']}.jpg\n") + images.append(f"{DOWNLOAD_FOLDER}/{name_prefix}/{slide['Index']}.jpg") - # if os.path.exists(f"{DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg"): - # print(f"Skipping {name_prefix} - {slide['index']}") - # continue - # - # with open(f"{DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg", "wb") as f: - # f.write(requests.get(slide['cover']).content) + else: + with open(f"{CACHE_FOLDER}/ppt_download.txt", "w", encoding='utf-8') as f: + for slide in ppt_raw_data['data']['slides']: + if not slide.get('cover'): + continue - ppt_download_command = (f"aria2c -i {CACHE_FOLDER}/ppt_download.txt -x 16 -s 16 -c " + f.write(f"{slide['cover']}\n out={DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg\n") + images.append(f"{DOWNLOAD_FOLDER}/{name_prefix}/{slide['index']}.jpg") + + ppt_download_command = (f".\\aria2c -i {CACHE_FOLDER}/ppt_download.txt -x 16 -s 16 -c " f"-l aria2c_ppt.log --log-level warn") - # os.system(f"aria2c -i {CACHE_FOLDER}/ppt_download.txt -x 16 -s 16 -c") + subprocess.run(['powershell', '-Command', ppt_download_command], text=True) from PIL import Image - if arg_ans: + if arg_ans and version != 1: from PIL import ImageDraw, ImageFont for problem in ppt_raw_data['data']['slides']: @@ -91,4 +93,7 @@ images[0].save(f"{DOWNLOAD_FOLDER}/{name_prefix}.pdf", "PDF", resolution=100.0, save_all=True, append_images=images[1:]) - print(f"Converted {name_prefix}") \ No newline at end of file + print(f"Converted {name_prefix}") + + # can be done like this TODO + # l2 = map(lambda x: x['B2'], ppt_raw_data['data']['slides']) diff --git a/scrap_example.txt b/scrap_example.txt index 8597018..dd58816 100644 --- a/scrap_example.txt +++ b/scrap_example.txt @@ -43,6 +43,55 @@ # "is_finished": true # } +# --- --- --- Section Download Lesson Video Old--- --- --- # + +{ + "msg": "", + "data": { + "lesson_duration": 6923000, + "live_timeline": [ + { + "replay_url": "https://bd-snap-video.xuetangx.com/bd-flv-video.xuetangx.com/xuetanglive/ProLive-3393352-75277078/xuetang_live_20200408132036.m3u8", + "duration": 171000, + "room_code": "ProLive-3393352-75277078", + "absolute_start": 1586323233000, + "absolute_end": 1586323404000, + "hidden_status": false, + "related_start": 35000, + "source": "bd", + "live_id": 286112, + "type": 2, + "order": 0 + }, + { + "replay_url": "https://kszt-playback.xuetangx.com/gifshow-xuetangx/a0b01f6f5285890801042726599/playlist_eof.m3u8?auth_key=1729973098-4419371689350062664-0-7af7d877d498d161e2b39ecc659f63bb", + "duration": 5850000, + "room_code": "kszt_WHY6X1WB3_Q", + "absolute_start": 1586324173000, + "absolute_end": 1586330023000, + "hidden_status": false, + "related_start": -12000, + "source": "th", + "live_id": 286180, + "type": 2, + "order": 1 + } + ], + "recorded_video": [], + "danmu_timeline": [], + "lesson_timeline": [ + { + "duration": 12748, + "dt": 1586323199012, + "code": "LESSON_START", + "type": "event", + "title": "\u4e0a\u8bfe\u5566\uff01" + }, + ], + "hidden_status": false + }, + "success": true +} # --- --- --- Section Download Segment --- --- --- # # { diff --git a/video_processing.py b/video_processing.py index 4316e13..5efbb2c 100644 --- a/video_processing.py +++ b/video_processing.py @@ -6,34 +6,78 @@ print(f"Downloading {name_prefix} - {order}") - video_download_command = (f"aria2c -o '{CACHE_FOLDER}/{name_prefix}-{order}.mp4'" + video_download_command = (f".\\aria2c -o '{CACHE_FOLDER}/{name_prefix}-{order}.mp4'" f" -x 16 -s 16 '{url}' -c -l aria2c_video.log --log-level warn") result = subprocess.run(['powershell', '-Command', video_download_command], text=True) return result +def download_segment_m3u8(CACHE_FOLDER, url: str, order: int, name_prefix: str = "" ): -def download_segments_in_parallel(CACHE_FOLDER, lesson_video_data, name_prefix): + print(f"Downloading {name_prefix} - {order}") + + video_download_command = (f".\\ffmpeg -i '{url}' -c copy -n '{CACHE_FOLDER}/{name_prefix}-{order}.mp4'" + f" -hide_banner -loglevel error -stats") + + result = subprocess.run(['powershell', '-Command', video_download_command], text=True) + + return result + + +def download_segments_in_parallel(fallback_flag, CACHE_FOLDER, lesson_video_data, name_prefix): has_error = False - # Create a ThreadPoolExecutor to manage parallel downloads - with ThreadPoolExecutor(max_workers=6) as executor: - # Dictionary to hold future results - future_to_order = { - executor.submit(download_segment,CACHE_FOLDER, segment['url'], order, name_prefix): order - for order, segment in enumerate(lesson_video_data['data']['live']) - } + if fallback_flag: + # Create a ThreadPoolExecutor to manage parallel downloads + with ThreadPoolExecutor(max_workers=6) as executor: + # Dictionary to hold future results + if 'm3u8' in lesson_video_data['data']['live_timeline'][0]['replay_url']: + future_to_order = { + executor.submit(download_segment_m3u8, CACHE_FOLDER, segment['replay_url'], order, name_prefix): order + for order, segment in enumerate(lesson_video_data['data']['live_timeline']) + } + else: + future_to_order = { + executor.submit(download_segment, CACHE_FOLDER, segment['replay_url'], order, name_prefix): order + for order, segment in enumerate(lesson_video_data['data']['live_timeline']) + } - # Iterate over the completed futures - for future in as_completed(future_to_order): - order = future_to_order[future] - try: - future.result() # Get the result (will raise exception if there was one) - print(f"Successfully downloaded {name_prefix} - {order}") - except Exception as e: - print(e) - print(f"Failed to download {name_prefix} - {order}", file=sys.stderr) - has_error = True + # Iterate over the completed futures + for future in as_completed(future_to_order): + order = future_to_order[future] + try: + future.result() # Get the result (will raise exception if there was one) + print(f"Successfully downloaded {name_prefix} - {order}") + except Exception as e: + print(e) + print(f"Failed to download {name_prefix} - {order}", file=sys.stderr) + has_error = True + + else: + # Create a ThreadPoolExecutor to manage parallel downloads + with ThreadPoolExecutor(max_workers=6) as executor: + # Dictionary to hold future results + if 'm3u8' in lesson_video_data['data']['live'][0]['url']: + future_to_order = { + executor.submit(download_segment_m3u8, CACHE_FOLDER, segment['url'], order, name_prefix): order + for order, segment in enumerate(lesson_video_data['data']['live']) + } + else: + future_to_order = { + executor.submit(download_segment, CACHE_FOLDER, segment['url'], order, name_prefix): order + for order, segment in enumerate(lesson_video_data['data']['live']) + } + + # Iterate over the completed futures + for future in as_completed(future_to_order): + order = future_to_order[future] + try: + future.result() # Get the result (will raise exception if there was one) + print(f"Successfully downloaded {name_prefix} - {order}") + except Exception as e: + print(e) + print(f"Failed to download {name_prefix} - {order}", file=sys.stderr) + has_error = True return has_error