Spaces:
Running on Zero
Running on Zero
| #!/usr/bin/env python3 | |
| """ | |
| Preprocess Context-as-Memory dataset folders into Echo-Memory metadata CSV. | |
| Expected dataset layout: | |
| - frames/: frame images organized by video | |
| - jsons/: camera pose information for each video | |
| - overlap_labels/: FOV overlap information for memory retrieval | |
| - captions.txt: video segment captions | |
| """ | |
| import argparse | |
| import csv | |
| import json | |
| import os | |
| from typing import Dict, List, Tuple | |
def parse_caption_line(line: str) -> Tuple[str, str]:
    """
    Split one captions.txt record into its clip path and caption text.

    A record has the form "video_name/start_end.mp4", a TAB, then the
    caption. Surrounding whitespace is removed before splitting, and only
    the first tab acts as the separator, so later tabs stay in the caption.

    Returns:
        (video_path, caption), or (None, None) when the stripped line
        contains no tab separator.
    """
    head, tab, tail = line.strip().partition("\t")
    if not tab:
        # No separator at all: the line is blank or malformed.
        return None, None
    return head, tail
def load_captions(captions_file: str) -> Dict[str, str]:
    """
    Load captions.txt as a mapping of video_name -> caption.

    The video name is the part of the clip path before the first "/".
    When a video has several captioned clips, only the caption of the
    first one listed in the file is kept. A missing file prints a warning
    and yields an empty mapping.
    """
    first_caption_by_video: Dict[str, str] = {}

    if not os.path.exists(captions_file):
        print(f"Warning: Captions file not found: {captions_file}")
        return first_caption_by_video

    with open(captions_file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            clip_path, text = parse_caption_line(raw_line)
            if not (clip_path and text):
                continue
            # setdefault keeps the earliest caption and ignores later ones.
            first_caption_by_video.setdefault(clip_path.split("/")[0], text)

    return first_caption_by_video
def get_frame_files(frames_dir: str, video_name: str) -> List[str]:
    """
    List the PNG frames of one video in natural (numeric-aware) order.

    Args:
        frames_dir: directory holding one sub-directory per video.
        video_name: name of the video's sub-directory inside frames_dir.

    Returns:
        Paths of the form "video_name/frame.png", i.e. relative to
        frames_dir. Empty when frames_dir/video_name is not a directory.
    """
    import re  # local import: only this function needs it

    video_frames_dir = os.path.join(frames_dir, video_name)
    # isdir (not exists): a stray regular file would make os.listdir raise.
    if not os.path.isdir(video_frames_dir):
        return []

    def natural_key(name):
        # re.split with a capturing group alternates text / digit runs and
        # always starts with a (possibly empty) text chunk, so odd indices
        # are exactly the digit runs and ints are only compared with ints.
        chunks = re.split(r"([0-9]+)", name)
        parsed = [int(c) if i % 2 else c for i, c in enumerate(chunks)]
        # The raw name breaks ties such as "01.png" vs "1.png" so the
        # result does not depend on os.listdir's arbitrary order.
        return parsed, name

    png_names = [
        entry for entry in os.listdir(video_frames_dir)
        if entry.endswith(".png")
    ]
    # Plain string sorting would put "10.png" before "2.png" when frame
    # numbers are not zero-padded; for zero-padded names both orders agree.
    return [
        os.path.join(video_name, entry)
        for entry in sorted(png_names, key=natural_key)
    ]
def load_camera_poses(json_file: str) -> Dict:
    """
    Load camera poses from a JSON file.

    Args:
        json_file: path to the pose JSON of one video.

    Returns:
        The value stored under the top-level "CineCameraActor" key when
        that key is present, otherwise the top-level object itself.
        An empty dict when the file does not exist or the top-level JSON
        value is not an object.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    if not os.path.exists(json_file):
        return {}
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Check the type before probing for the key: `in` on a JSON number
    # raises TypeError, and on a list it is a membership test whose
    # follow-up string index would also raise.
    if not isinstance(data, dict):
        return {}
    return data.get("CineCameraActor", data)
def load_overlap_labels(overlap_dir: str, video_name: str, frame_idx: int) -> List[int]:
    """
    Load overlapping frame indices for a given frame.

    Reads overlap_dir/video_name/frame_idx.json and returns the entries of
    its "overlapping_frames" list that are non-negative decimal integers
    (either JSON numbers or digit strings). Other entries are skipped.

    Loading is best-effort: a missing or unreadable file, malformed JSON,
    a non-object top level, or a non-list "overlapping_frames" value all
    yield an empty list instead of raising.
    """
    overlap_file = os.path.join(overlap_dir, video_name, f"{frame_idx}.json")
    try:
        with open(overlap_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError: invalid JSON or
        # invalid UTF-8 (JSONDecodeError and UnicodeDecodeError derive
        # from it).
        return []

    if not isinstance(data, dict):
        return []
    overlapping_frames = data.get("overlapping_frames", [])
    if not isinstance(overlapping_frames, list):
        return []

    # isdecimal(), not isdigit(): isdigit() also accepts characters such
    # as superscript digits that int() rejects, and one such entry used to
    # discard the whole label file.
    return [
        int(frame) for frame in overlapping_frames
        if str(frame).isdecimal()
    ]
def create_metadata_csv(
    dataset_base_path: str,
    output_csv: str,
    segment_length: int = 81,
    context_frames: int = 5,
):
    """
    Create metadata CSV for the Context-as-Memory dataset.

    Every video directory under dataset_base_path/frames is cut into
    fixed-length windows that advance by half a segment (so consecutive
    windows overlap), and one CSV row is written per window. Trailing
    frames that do not fill a complete window are not emitted. Videos
    with fewer than segment_length frames are skipped.

    CSV columns:
        video: "|"-separated frame paths, each relative to
            dataset_base_path (every entry starts with "frames/").
        prompt: the video's caption from captions.txt, or a generic
            "A scene from NAME" fallback.
        video_name: name of the video directory.
        start_frame / end_frame: inclusive indices of the window within
            the video's sorted frame list.

    Args:
        dataset_base_path: root of the dataset.
        output_csv: output CSV path; its parent directory is created.
        segment_length: frames per training segment.
        context_frames: context frames reserved by downstream workflows;
            only echoed in the log here, not written to the CSV.

    Raises:
        ValueError: if segment_length is smaller than 1.
    """
    if segment_length < 1:
        # A non-positive length would emit rows with empty frame lists.
        raise ValueError(f"segment_length must be >= 1, got {segment_length}")

    frames_dir = os.path.join(dataset_base_path, "frames")
    captions_file = os.path.join(dataset_base_path, "captions.txt")
    captions = load_captions(captions_file)

    if not os.path.exists(frames_dir):
        print(f"Error: Frames directory not found: {frames_dir}")
        return

    video_names = [
        d for d in os.listdir(frames_dir)
        if os.path.isdir(os.path.join(frames_dir, d))
    ]
    print(f"Found {len(video_names)} videos")
    print(f"Context frames: {context_frames}")

    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    fieldnames = [
        "video",
        "prompt",
        "video_name",
        "start_frame",
        "end_frame",
    ]
    # Half-segment stride gives 50% overlap between consecutive windows.
    step = max(1, segment_length // 2)
    total_segments = 0

    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for video_name in sorted(video_names):
            print(f"Processing video: {video_name}")
            frame_files = get_frame_files(frames_dir, video_name)
            if len(frame_files) < segment_length:
                print(
                    f"  Skipping {video_name}: only {len(frame_files)} frames "
                    f"(need at least {segment_length})"
                )
                continue

            prompt = captions.get(video_name, f"A scene from {video_name}")
            video_segments = 0
            # The range bound guarantees every window is full length.
            for start_idx in range(0, len(frame_files) - segment_length + 1, step):
                end_idx = start_idx + segment_length - 1
                segment_frames = frame_files[start_idx:end_idx + 1]
                # Prefix every frame: joining "frames" onto the already
                # "|"-joined string would only prefix the first entry.
                video_path = "|".join(
                    os.path.join("frames", frame) for frame in segment_frames
                )
                writer.writerow({
                    "video": video_path,
                    "prompt": prompt,
                    "video_name": video_name,
                    "start_frame": start_idx,
                    "end_frame": end_idx,
                })
                total_segments += 1
                video_segments += 1
            print(f"  Created {video_segments} segments for {video_name}")

    print(f"\nTotal segments created: {total_segments}")
    print(f"Metadata CSV saved to: {output_csv}")
def main():
    """
    Command-line entry point.

    Parses the dataset location, output CSV path, segment length and
    context-frame count, then builds the metadata CSV. A relative
    --output_csv is resolved against --dataset_base_path.
    """
    cli = argparse.ArgumentParser(description="Preprocess Context-as-Memory Dataset")

    # (flag, add_argument keyword options), registered in this order.
    option_specs = (
        ("--dataset_base_path", {
            "type": str,
            "required": True,
            "help": "Base path to Context-as-Memory dataset",
        }),
        ("--output_csv", {
            "type": str,
            "default": "metadata.csv",
            "help": "Output CSV file path (default: metadata.csv)",
        }),
        ("--segment_length", {
            "type": int,
            "default": 81,
            "help": "Length of video segments (default: 81 frames)",
        }),
        ("--context_frames", {
            "type": int,
            "default": 5,
            "help": "Number of context frames (default: 5)",
        }),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    args = cli.parse_args()

    csv_target = args.output_csv
    if not os.path.isabs(csv_target):
        # Relative output paths land inside the dataset directory.
        csv_target = os.path.join(args.dataset_base_path, csv_target)

    create_metadata_csv(
        dataset_base_path=args.dataset_base_path,
        output_csv=csv_target,
        segment_length=args.segment_length,
        context_frames=args.context_frames,
    )


if __name__ == "__main__":
    main()