{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/root/miniconda3/envs/groupvit/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "import pandas as pd\n", "import os\n", "import json\n", "import webdataset as wds\n", "import re\n", "import random" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/root/dataset\n" ] } ], "source": [ "home_dir = os.path.expanduser('~')\n", "dataset_path = os.path.join(home_dir, 'dataset')\n", "print(dataset_path)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "CUHK_PEDES_path = os.path.join(dataset_path, 'CUHK-PEDES')\n", "annotation_path = os.path.join(CUHK_PEDES_path, 'processed_data')\n", "image_path = os.path.join(CUHK_PEDES_path, 'imgs')\n", "train_json_path = os.path.join(annotation_path, 'train.json')\n", "val_json_path = os.path.join(annotation_path, 'val.json')\n", "test_json_path = os.path.join(annotation_path, 'test.json')\n", "reid_raw_file = os.path.join(CUHK_PEDES_path, 'reid_raw.json')\n", "base = os.path.join(dataset_path, 'CUHK-PEDES_shards')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " split captions \\\n", "0 train [A pedestrian with dark hair is wearing red an... \n", "1 train [A man wearing a black jacket, black pants, re... \n", "2 train [The man is wearing a black jacket, green jean... \n", "3 train [He's wearing a black hooded sweatshirt with a... \n", "4 train [The man is walking. He is wearing a bright g... \n", "\n", " file_path \\\n", "0 CUHK01/0363004.png \n", "1 CUHK01/0363003.png \n", "2 CUHK01/0363001.png \n", "3 CUHK01/0363002.png \n", "4 train_query/p8130_s10935.jpg \n", "\n", " processed_tokens id \n", "0 [[a, pedestrian, with, dark, hair, is, wearing... 1 \n", "1 [[a, man, wearing, a, black, jacket, black, pa... 1 \n", "2 [[the, man, is, wearing, a, black, jacket, gre... 1 \n", "3 [[hes, wearing, a, black, hooded, sweatshirt, ... 1 \n", "4 [[the, man, is, walking, he, is, wearing, a, b... 
2 \n", "40206\n" ] } ], "source": [ "flag = \"None\"\n", "if os.path.exists(val_json_path) & os.path.exists(train_json_path) & os.path.exists(test_json_path):\n", " with open(train_json_path, 'r') as file:\n", " train_json = json.load(file)\n", " with open(test_json_path, 'r') as file:\n", " test_json = json.load(file)\n", " with open(val_json_path, 'r') as file:\n", " val_json = json.load(file)\n", " train_data = pd.DataFrame(train_json)\n", " test_data = pd.DataFrame(test_json)\n", " val_data = pd.DataFrame(val_json)\n", " print(train_data.head())\n", " print(train_data.shape[0])\n", " flag = \"ttv\"\n", "elif os.path.exists(reid_raw_file):\n", " with open(reid_raw_file, 'r') as file:\n", " reid_json = json.load(file)\n", " reid_data = pd.DataFrame(reid_json)\n", " print(reid_data.head())\n", " print(reid_data.shape[0])\n", " flag = \"raw\"\n", "else: raise FileNotFoundError" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# 创建一个具有预定义列的空 DataFrame\n", "columns = ['file_path', 'captions', 'id']\n", "processed_train_data = pd.DataFrame(columns=columns)\n", "processed_test_data = pd.DataFrame(columns=columns)\n", "processed_val_data = pd.DataFrame(columns=columns)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# 遍历数据集并更新 processed_data\n", "if flag == 'ttv':\n", " processed_train_data = train_data[['file_path', 'captions', 'id']]\n", " # for index, row in train_data.iterrows():\n", " # id = row['id']\n", " # file_path = row['file_path']\n", " # captions = row['captions']\n", "\n", " # # 确保 captions 是一个字符串并转换为小写\n", " # if isinstance(captions, list):\n", " # captions = ' '.join(captions).lower()\n", " # else:\n", " # captions = captions.lower()\n", " \n", " # # 将结果添加到 processed_data 中\n", " # new_row = pd.DataFrame({'file_path': [file_path], 'captions': [captions], 'id': [id]})\n", " # processed_train_data = pd.concat([processed_train_data, new_row], ignore_index=True)\n", " processed_test_data = test_data[['file_path', 'captions', 'id']]\n", " # for index, row in test_data.iterrows():\n", " # id = row['id']\n", " # file_path = row['file_path']\n", " # captions = row['captions']\n", "\n", " # # 确保 captions 是一个字符串并转换为小写\n", " # if isinstance(captions, list):\n", " # captions = ' '.join(captions).lower()\n", " # else:\n", " # captions = captions.lower()\n", " \n", " # # 将结果添加到 processed_data 中\n", " # new_row = pd.DataFrame({'file_path': [file_path], 'captions': [captions], 'id': [id]})\n", " # processed_test_data = pd.concat([processed_test_data, new_row], ignore_index=True)\n", " processed_val_data = val_data[['file_path', 'captions', 'id']]\n", " # for index, row in val_data.iterrows():\n", " # id = row['id']\n", " # file_path = row['file_path']\n", " # captions = row['captions']\n", "\n", " # # 确保 captions 是一个字符串并转换为小写\n", " # if isinstance(captions, list):\n", " # captions = ' '.join(captions).lower()\n", " # else:\n", " # captions = captions.lower()\n", " \n", " # # 将结果添加到 processed_data 中\n", " # new_row = pd.DataFrame({'file_path': [file_path], 'captions': [captions], 'id': [id]})\n", " # processed_val_data = pd.concat([processed_val_data, new_row], ignore_index=True)\n", " \n", "elif flag == 'raw':\n", " processed_train_data = reid_data.loc[reid_data['split'] == 'train']\n", " processed_test_data = reid_data.loc[reid_data['split'] == 'test']\n", " processed_val_data = reid_data.loc[reid_data['split'] == 'val']\n", " processed_train_data = processed_train_data[['file_path', 'captions', 
'id']]\n", " processed_test_data = processed_test_data[['file_path', 'captions', 'id']]\n", " processed_val_data = processed_val_data[['file_path', 'captions', 'id']]\n", " # for index, row in reid_data.iterrows():\n", " # id = row['id']\n", " # file_path = row['file_path']\n", " # captions = row['captions']\n", "\n", " # # 确保 captions 是一个字符串并转换为小写\n", " # if isinstance(captions, list):\n", " # captions = ' '.join(captions).lower()\n", " # else:\n", " # captions = captions.lower()\n", " \n", " # new_row = pd.DataFrame({'file_path': [file_path], 'captions': [captions], 'id': [id]})\n", " # # 将结果添加到processed_data 中\n", " # if row['split'] == 'train':\n", " # processed_train_data = pd.concat([processed_train_data, new_row], ignore_index=True)\n", " # elif row['split'] == 'test':\n", " # processed_test_data = pd.concat([processed_test_data, new_row], ignore_index=True)\n", " # elif row['split'] == 'val':\n", " # processed_val_data = pd.concat([processed_val_data, new_row], ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " file_path \\\n", "0 CUHK01/0363004.png \n", "1 CUHK01/0363003.png \n", "2 CUHK01/0363001.png \n", "3 CUHK01/0363002.png \n", "4 train_query/p8130_s10935.jpg \n", "\n", " captions id \n", "0 a pedestrian with dark hair is wearing red and... 1 \n", "1 a man wearing a black jacket, black pants, red... 1 \n", "2 the man is wearing a black jacket, green jeans... 1 \n", "3 he's wearing a black hooded sweatshirt with a ... 1 \n", "4 the man is walking. he is wearing a bright gr... 2 \n", " file_path \\\n", "0 train_query/p8848_s17661.jpg \n", "1 train_query/p8848_s17662.jpg \n", "2 train_query/p8848_s17663.jpg \n", "3 train_query/p4327_s5502.jpg \n", "4 train_query/p4327_s5503.jpg \n", "\n", " captions id \n", "0 a man wearing a blue and white stripe tank top... 12004 \n", "1 a man wearing a white and gray stripe shirt, a... 12004 \n", "2 the man is wearing green pants and a green and... 12004 \n", "3 a person is carrying a black shoulder bag over... 12005 \n", "4 young man with dark hair and glasses, dark and... 12005 \n", " file_path \\\n", "0 CUHK01/0107002.png \n", "1 CUHK01/0107004.png \n", "2 CUHK01/0107001.png \n", "3 CUHK01/0107003.png \n", "4 test_query/p5969_s7727.jpg \n", "\n", " captions id \n", "0 the man has short, dark hair and wears khaki p... 11004 \n", "1 a man with a gray hoodie, book bag, and khaki ... 11004 \n", "2 the man is wearing a grey hooded sweater, brow... 11004 \n", "3 man wearing a grey jacket, brown pants and bla... 11004 \n", "4 the woman is wearing a floral printed shirt wi... 
11005 \n" ] } ], "source": [ "# 定义一个函数,将列表中的字符串转换为小写并合并\n", "def process_captions(captions_list):\n", " if isinstance(captions_list, list):\n", " return ' '.join([caption.lower() for caption in captions_list])\n", " else:\n", " return captions_list.lower()\n", " \n", "processed_train_data['captions'] = processed_train_data['captions'].apply(process_captions)\n", "processed_test_data['captions'] = processed_test_data['captions'].apply(process_captions)\n", "processed_val_data['captions'] = processed_val_data['captions'].apply(process_captions)\n", "\n", "processed_train_data.reset_index(drop=True, inplace=True)\n", "processed_test_data.reset_index(drop=True, inplace=True)\n", "processed_val_data.reset_index(drop=True, inplace=True)\n", "\n", "print(processed_train_data.head())\n", "print(processed_test_data.head())\n", "print(processed_val_data.head())" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "train_images = processed_train_data.shape[0]\n", "train_indexes = list(range(train_images))\n", "random.shuffle(train_indexes)\n", "\n", "test_images = processed_test_data.shape[0]\n", "test_indexes = list(range(test_images))\n", "random.shuffle(test_indexes)\n", "\n", "val_images = processed_val_data.shape[0]\n", "val_indexes = list(range(val_images))\n", "random.shuffle(val_indexes)\n", "\n", "train_pattern = os.path.join(base, f\"cuhkpedes-train-%06d.tar\")\n", "test_pattern = os.path.join(base, f\"cuhkpedes-test-%06d.tar\")\n", "val_pattern = os.path.join(base, f\"cuhkpedes-val-%06d.tar\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "def readfile(fname):\n", " \"Read a binary file from disk.\"\n", " with open(fname, \"rb\") as stream:\n", " return stream.read()\n", "\n", "train_keys = set()\n", "test_keys = set()\n", "val_keys = set()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-train-000000.tar 0 0.0 GB 0\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-train-000001.tar 8060 0.1 GB 8060\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-train-000002.tar 8010 0.1 GB 16070\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-train-000003.tar 7924 0.1 GB 23994\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-train-000004.tar 7933 0.1 GB 31927\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-test-000000.tar 0 0.0 GB 0\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-val-000000.tar 0 0.0 GB 0\n" ] } ], "source": [ "def write_to_tar(processed_data, image_path, indexes, all_keys, pattern, maxcount=10000, maxsize=6e7):\n", " \n", " output_dir = os.path.dirname(pattern)\n", " os.makedirs(output_dir, exist_ok=True)\n", " with wds.ShardWriter(pattern, maxcount, maxsize) as sink:\n", " for i in indexes:\n", " # instance: the file name and the numerical class.\n", " fname = processed_data.at[i, 'file_path']\n", " captions = processed_data.at[i, 'captions']\n", " id = processed_data.at[i, 'id']\n", " fname = os.path.join(image_path, fname)\n", "\n", " # Read the JPEG-compressed image file contents.\n", " image = readfile(fname)\n", "\n", " # Construct a uniqu keye from the filename.\n", " # base_dir = os.path.dirname(fname)\n", " # dir_name = os.path.basename(base_dir)\n", " # key = os.path.splitext(os.path.basename(fname))[0]\n", " key = f\"{id}_{i}\"\n", "\n", " # Useful check.\n", " assert key not in all_keys, f\"Conflict 
{ "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "def readfile(fname):\n", "    \"Read a binary file from disk.\"\n", "    with open(fname, \"rb\") as stream:\n", "        return stream.read()\n", "\n", "train_keys = set()\n", "test_keys = set()\n", "val_keys = set()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-train-000000.tar 0 0.0 GB 0\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-train-000001.tar 8060 0.1 GB 8060\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-train-000002.tar 8010 0.1 GB 16070\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-train-000003.tar 7924 0.1 GB 23994\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-train-000004.tar 7933 0.1 GB 31927\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-test-000000.tar 0 0.0 GB 0\n", "# writing /root/dataset/CUHK-PEDES_shards/cuhkpedes-val-000000.tar 0 0.0 GB 0\n" ] } ], "source": [ "def write_to_tar(processed_data, image_path, indexes, all_keys, pattern, maxcount=10000, maxsize=6e7):\n", "    output_dir = os.path.dirname(pattern)\n", "    os.makedirs(output_dir, exist_ok=True)\n", "    with wds.ShardWriter(pattern, maxcount, maxsize) as sink:\n", "        for i in indexes:\n", "            # Look up the image path, caption string, and person id for this row.\n", "            fname = processed_data.at[i, 'file_path']\n", "            captions = processed_data.at[i, 'captions']\n", "            pid = processed_data.at[i, 'id']\n", "            fname = os.path.join(image_path, fname)\n", "\n", "            # Read the compressed image file contents as raw bytes.\n", "            image = readfile(fname)\n", "\n", "            # Construct a unique key from the person id and the row index.\n", "            key = f\"{pid}_{i}\"\n", "\n", "            # Guard against duplicate keys within a split.\n", "            assert key not in all_keys, f\"Conflict detected: Key '{key}' already exists.\"\n", "            all_keys.add(key)\n", "\n", "            # Construct a sample.\n", "            sample = {\"__key__\": key, \"jpg\": image, \"txt\": captions}\n", "\n", "            # Write the sample to the sharded tar archives.\n", "            sink.write(sample)\n", "\n", "\n", "write_to_tar(processed_train_data, image_path, train_indexes, train_keys, train_pattern)\n", "write_to_tar(processed_test_data, image_path, test_indexes, test_keys, test_pattern)\n", "write_to_tar(processed_val_data, image_path, val_indexes, val_keys, val_pattern)" ] }
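, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional round-trip check (an added sketch, not part of the original\n", "# pipeline): stream a few samples back out of the freshly written training\n", "# shards to confirm that images decode and captions survived. Assumes the\n", "# train shards above were written successfully; note the images are stored\n", "# under the .jpg extension even when the source file was a PNG, and the PIL\n", "# decoder sniffs the real format from the bytes.\n", "import glob\n", "\n", "shards = sorted(glob.glob(os.path.join(base, 'cuhkpedes-train-*.tar')))\n", "dataset = wds.WebDataset(shards).decode('pil').to_tuple('jpg', 'txt')\n", "for i, (image, caption) in enumerate(dataset):\n", "    print(image.size, caption[:60])\n", "    if i >= 2:\n", "        break" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" } }, "nbformat": 4, "nbformat_minor": 2 }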