Skip to content

Commit

Permalink
ignore submission and show processing
Browse files Browse the repository at this point in the history
  • Loading branch information
qew21 authored and you-n-g committed Jan 9, 2025
1 parent c1c9f93 commit 3096cf5
Showing 1 changed file with 21 additions and 6 deletions.
27 changes: 21 additions & 6 deletions rdagent/scenarios/data_science/debug/data.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from collections import Counter
import os
import platform
import shutil
from pathlib import Path

import pandas as pd

from tqdm import tqdm

try:
import bson # pip install pymongo
except:
Expand Down Expand Up @@ -99,7 +102,7 @@ def __init__(self, min_frac=0.02, min_num=5):

def reduce(self, df: pd.DataFrame) -> pd.DataFrame:
    """Return a reproducible random sample of ``df``.

    The sampled fraction is the larger of ``self.min_frac`` and
    ``self.min_num / len(df)``, so at least ``min_num`` rows are requested
    whenever the frame has that many.

    Args:
        df: The frame to down-sample.

    Returns:
        ``df`` itself when it is empty or when the computed fraction
        reaches 1 (nothing to reduce); otherwise a new frame drawn with
        ``DataFrame.sample`` and a fixed ``random_state`` of 1.
    """
    n_rows = len(df)
    if n_rows == 0:
        # An empty frame has nothing to sample, and dividing by its length
        # below would raise ZeroDivisionError.
        return df

    # No per-call print here: the sampling-rate message is disabled.
    frac = max(self.min_frac, self.min_num / n_rows)
    if frac >= 1:
        return df
    return df.sample(frac=frac, random_state=1)
Expand Down Expand Up @@ -137,11 +140,14 @@ def create_debug_data(
# Traverse the folder and exclude specific file types
included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5", ".jsonl", ".bson"}
files_to_process = [file for file in data_folder.rglob("*") if file.is_file()]

total_files_count = len(files_to_process)
print(
f"[INFO] Original dataset folder `{data_folder}` has {total_files_count} files in total (including subfolders)."
)
file_types_count = Counter(file.suffix.lower() for file in files_to_process)
print("File type counts:")
for file_type, count in file_types_count.items():
print(f"{file_type}: {count}")

# This set will store filenames or paths that appear in the sampled data
sample_used_file_names = set()
Expand All @@ -150,26 +156,34 @@ def create_debug_data(
data_handler = GenericDataHandler()
data_reducer = dr_cls(min_frac=min_frac, min_num=min_num)

for file_path in files_to_process:
skip_subfolder_data = any(f.is_file() and f.suffix in included_extensions for f in data_folder.iterdir() if f.name.startswith(("train", "test")))
processed_files = []

for file_path in tqdm(files_to_process, desc="Processing data", unit="file"):
sampled_file_path = sample_folder / file_path.relative_to(data_folder)
if sampled_file_path.exists():
continue

if file_path.suffix.lower() not in included_extensions:
continue

if skip_subfolder_data and file_path.parent != data_folder:
continue # bypass files in subfolders

sampled_file_path.parent.mkdir(parents=True, exist_ok=True)

# Load the original data
df = data_handler.load(file_path)

# Create a sampled subset
df_sampled = data_reducer.reduce(df)

processed_files.append(file_path)
# Dump the sampled data
try:
data_handler.dump(df_sampled, sampled_file_path)
# Extract possible file references from the sampled data
if "submission" in file_path.stem:
continue # Skip submission files
for col in df_sampled.columns:
unique_vals = df_sampled[col].astype(str).unique()
for val in unique_vals:
Expand All @@ -183,13 +197,13 @@ def create_debug_data(
# Process non-data files
subfolder_dict = {}
for file_path in files_to_process:
if file_path.suffix.lower() in included_extensions:
if file_path in processed_files:
continue # Already handled above
rel_dir = file_path.relative_to(data_folder).parts[0]
subfolder_dict.setdefault(rel_dir, []).append(file_path)

# For each subfolder, decide which files to copy
for rel_dir, file_list in subfolder_dict.items():
for rel_dir, file_list in tqdm(subfolder_dict.items(), desc="Processing files", unit="file"):
used_files = []
not_used_files = []

Expand Down Expand Up @@ -223,5 +237,6 @@ def create_debug_data(
sampled_file_path.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(nf, sampled_file_path)


final_files_count = count_files_in_folder(sample_folder)
print(f"[INFO] After sampling, the sample folder `{sample_folder}` contains {final_files_count} files in total.")

0 comments on commit 3096cf5

Please sign in to comment.