|
@@ -0,0 +1,348 @@
|
|
|
+{
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 1,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stderr",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "/mnt/vos-s9gjtkm2/reid/miniconda3/envs/groupvit/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
|
|
+ " from .autonotebook import tqdm as notebook_tqdm\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "import pandas as pd\n",
|
|
|
+ "import os\n",
|
|
|
+ "import json\n",
|
|
|
+ "import webdataset as wds\n",
|
|
|
+ "import re\n",
|
|
|
+ "import random"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 2,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "/mnt/vos-s9gjtkm2/reid/dataset/cross_reid\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "current_path = os.getcwd()\n",
|
|
|
+ "print(current_path)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 3,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "CUHK_PEDES_path = os.path.join(current_path, 'CUHK-PEDES')\n",
|
|
|
+ "annotation_path = os.path.join(CUHK_PEDES_path, 'processed_data')\n",
|
|
|
+ "image_path = os.path.join(CUHK_PEDES_path, 'imgs')\n",
|
|
|
+ "train_json_path = os.path.join(annotation_path, 'train.json')\n",
|
|
|
+ "val_json_path = os.path.join(annotation_path, 'val.json')\n",
|
|
|
+ "test_json_path = os.path.join(annotation_path, 'test.json')\n",
|
|
|
+ "base = os.path.join(current_path, 'CUHK-PEDES_shards')"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 4,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ " split captions \\\n",
|
|
|
+ "0 val [The man has short, dark hair and wears khaki ... \n",
|
|
|
+ "1 val [A man with a gray hoodie, book bag, and khaki... \n",
|
|
|
+ "2 val [The man is wearing a grey hooded sweater, bro... \n",
|
|
|
+ "3 val [Man wearing a grey jacket, brown pants and bl... \n",
|
|
|
+ "4 val [The woman is wearing a floral printed shirt w... \n",
|
|
|
+ "\n",
|
|
|
+ " file_path \\\n",
|
|
|
+ "0 CUHK01/0107002.png \n",
|
|
|
+ "1 CUHK01/0107004.png \n",
|
|
|
+ "2 CUHK01/0107001.png \n",
|
|
|
+ "3 CUHK01/0107003.png \n",
|
|
|
+ "4 test_query/p5969_s7727.jpg \n",
|
|
|
+ "\n",
|
|
|
+ " processed_tokens id \n",
|
|
|
+ "0 [[the, man, has, short, dark, hair, and, wears... 11004 \n",
|
|
|
+ "1 [[a, man, with, a, gray, hoodie, book, bag, an... 11004 \n",
|
|
|
+ "2 [[the, man, is, wearing, a, grey, hooded, swea... 11004 \n",
|
|
|
+ "3 [[man, wearing, a, grey, jacket, brown, pants,... 11004 \n",
|
|
|
+ "4 [[the, woman, is, wearing, a, floral, printed,... 11005 \n",
|
|
|
+ "3078\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "with open(val_json_path, 'r') as file:\n",
|
|
|
+ " data = json.load(file)\n",
|
|
|
+ "\n",
|
|
|
+ "train_json = pd.DataFrame(data)\n",
|
|
|
+ "print(train_json.head())\n",
|
|
|
+ "print(train_json.shape[0])"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 5,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
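+ "# Define a function that replaces personal pronouns and person nouns with an id-tagged token (male_{id}, female_{id}, person_{id}, people_{id})\n",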
|
|
|
+ "def replace_pronouns_and_nouns(sentence, id):\n",
|
|
|
+ " replacements = {\n",
|
|
|
+ " r'\\bmale\\b': f'male_{id}',\n",
|
|
|
+ " r'\\bman\\b': f'male_{id}',\n",
|
|
|
+ " r'\\bmans\\b': f'male_{id}',\n",
|
|
|
+ " r'\\bhe\\b': f'male_{id}',\n",
|
|
|
+ " r'\\bboy\\b': f'male_{id}',\n",
|
|
|
+ " r'\\bgentleman\\b': f'male_{id}',\n",
|
|
|
+ " r'\\bguy\\b':f'male_{id}',\n",
|
|
|
+ " r'\\bfemale\\b': f'female_{id}',\n",
|
|
|
+ " r'\\bwoman\\b': f'female_{id}',\n",
|
|
|
+ " r'\\bwomen\\b': f'female_{id}',\n",
|
|
|
+ " r'\\bshe\\b': f'female_{id}',\n",
|
|
|
+ " r'\\bgirl\\b': f'female_{id}',\n",
|
|
|
+ " r'\\bgirls\\b': f'female_{id}',\n",
|
|
|
+ " r'\\blady\\b': f'female_{id}',\n",
|
|
|
+ " r'\\bcheerleader\\b': f'female_{id}',\n",
|
|
|
+ " r'\\bperson\\b':f'person_{id}',\n",
|
|
|
+ " r'\\bi\\b':f'person_{id}',\n",
|
|
|
+ " r'\\byou\\b':f'person_{id}',\n",
|
|
|
+ " r'\\bbaby\\b':f'person_{id}',\n",
|
|
|
+ " r'\\bchild\\b':f'person_{id}',\n",
|
|
|
+ " r'\\badult\\b':f'person_{id}',\n",
|
|
|
+ " r'\\bpedestrian\\b':f'person_{id}',\n",
|
|
|
+ " r'\\bunknown gender\\b':f'person_{id}',\n",
|
|
|
+ " r'\\bunknown subject\\b':f'person_{id}',\n",
|
|
|
+ " r'\\bwe\\b': f'people_{id}',\n",
|
|
|
+ " r'\\bthey\\b': f'people_{id}',\n",
|
|
|
+ " r'\\bpeople\\b': f'people_{id}'\n",
|
|
|
+ " }\n",
|
|
|
+ " for pattern, replacement in replacements.items():\n",
|
|
|
+ " sentence = re.sub(pattern, replacement, sentence)\n",
|
|
|
+ " return sentence"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 6,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Create an empty DataFrame with predefined columns\n",
|
|
|
+ "columns = ['file_path', 'caption', 'id']\n",
|
|
|
+ "preprocess_df = pd.DataFrame(columns=columns)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 7,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Iterate over every record loaded from the JSON file\n",
|
|
|
+ "for index, row in train_json.iterrows():\n",
|
|
|
+ " id = row['id']\n",
|
|
|
+ " file_path = row['file_path']\n",
|
|
|
+ " caption = row['captions']\n",
|
|
|
+ "\n",
|
|
|
+ " # Make sure captions is a single string and convert it to lowercase\n",
|
|
|
+ " if isinstance(caption, list):\n",
|
|
|
+ " caption = ' '.join(caption).lower()\n",
|
|
|
+ " else:\n",
|
|
|
+ " caption = caption.lower()\n",
|
|
|
+ "\n",
|
|
|
+ " # Replace personal pronouns and nouns\n",
|
|
|
+ " replaced_caption = replace_pronouns_and_nouns(caption, id)\n",
|
|
|
+ "\n",
|
|
|
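+ " # Extract [person_{id}] and the entities matching TOP_CLASSES_1 (NOTE(review): `entities` is not used later in this cell and TOP_CLASSES_1 is not defined here - confirm whether this is leftover)\n",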
|
|
|
+ " entities = []\n",
|
|
|
+ "\n",
|
|
|
+ " # Patterns matching the id-tagged tokens produced by the replacement step\n",
|
|
|
+ " person_patterns = [\n",
|
|
|
+ " re.compile(r'\\bmale_\\d+\\b'),\n",
|
|
|
+ " re.compile(r'\\bfemale_\\d+\\b'),\n",
|
|
|
+ " re.compile(r'\\bperson_\\d+\\b'),\n",
|
|
|
+ " re.compile(r'\\bpeople_\\d+\\b')\n",
|
|
|
+ " ]\n",
|
|
|
+ " \n",
|
|
|
+ " # Check whether the caption contains any replaced pronoun or noun\n",
|
|
|
+ " if not any(pattern.search(replaced_caption) for pattern in person_patterns):\n",
|
|
|
+ " print(f\"No replacement in sentence: {id}\")\n",
|
|
|
+ "\n",
|
|
|
+ " # Append the result to preprocess_df\n",
|
|
|
+ " new_row = pd.DataFrame({'file_path': [file_path], 'caption': [replaced_caption], 'id': [id]})\n",
|
|
|
+ " preprocess_df = pd.concat([preprocess_df, new_row], ignore_index=True)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 8,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ " file_path \\\n",
|
|
|
+ "0 CUHK01/0107002.png \n",
|
|
|
+ "1 CUHK01/0107004.png \n",
|
|
|
+ "2 CUHK01/0107001.png \n",
|
|
|
+ "3 CUHK01/0107003.png \n",
|
|
|
+ "4 test_query/p5969_s7727.jpg \n",
|
|
|
+ "\n",
|
|
|
+ " caption id \n",
|
|
|
+ "0 the male_11004 has short, dark hair and wears ... 11004 \n",
|
|
|
+ "1 a male_11004 with a gray hoodie, book bag, and... 11004 \n",
|
|
|
+ "2 the male_11004 is wearing a grey hooded sweate... 11004 \n",
|
|
|
+ "3 male_11004 wearing a grey jacket, brown pants ... 11004 \n",
|
|
|
+ "4 the female_11005 is wearing a floral printed s... 11005 \n",
|
|
|
+ "the male_11004 has short, dark hair and wears khaki pants with an oversized grey hoodie. his black backpack hangs from one shoulder. a male_11004 wearing a gray, hooded jacket, a pair of wrinkled brown pants, a gray backpack and a pair of dark colored shoes.\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "print(preprocess_df.head())\n",
|
|
|
+ "print(preprocess_df.at[0, 'caption'])"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 9,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "nimages = preprocess_df.shape[0]\n",
|
|
|
+ "indexes = list(range(nimages))\n",
|
|
|
+ "random.shuffle(indexes)\n",
|
|
|
+ "\n",
|
|
|
+ "# pattern = os.path.join(base, f\"cuhkpedes-train-%06d.tar\")\n",
|
|
|
+ "pattern = os.path.join(base, f\"cuhkpedes-val-%06d.tar\")"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 10,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000000.tar 0 0.0 GB 0\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000001.tar 133 0.0 GB 133\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000002.tar 148 0.0 GB 281\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000003.tar 139 0.0 GB 420\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000004.tar 134 0.0 GB 554\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000005.tar 123 0.0 GB 677\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000006.tar 136 0.0 GB 813\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000007.tar 126 0.0 GB 939\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000008.tar 136 0.0 GB 1075\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000009.tar 151 0.0 GB 1226\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000010.tar 146 0.0 GB 1372\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000011.tar 143 0.0 GB 1515\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000012.tar 146 0.0 GB 1661\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000013.tar 127 0.0 GB 1788\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000014.tar 145 0.0 GB 1933\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000015.tar 135 0.0 GB 2068\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000016.tar 133 0.0 GB 2201\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000017.tar 121 0.0 GB 2322\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000018.tar 120 0.0 GB 2442\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000019.tar 128 0.0 GB 2570\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000020.tar 124 0.0 GB 2694\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000021.tar 115 0.0 GB 2809\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000022.tar 138 0.0 GB 2947\n",
|
|
|
+ "# writing /mnt/vos-s9gjtkm2/reid/dataset/cross_reid/CUHK-PEDES_shards/cuhkpedes-val-000023.tar 128 0.0 GB 3075\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "def readfile(fname):\n",
|
|
|
+ " \"Read a binary file from disk.\"\n",
|
|
|
+ " with open(fname, \"rb\") as stream:\n",
|
|
|
+ " return stream.read()\n",
|
|
|
+ " \n",
|
|
|
+ "all_keys = set()\n",
|
|
|
+ "\n",
|
|
|
+ "with wds.ShardWriter(pattern, maxsize=1000000, maxcount=1000000) as sink:\n",
|
|
|
+ " for i in indexes:\n",
|
|
|
+ "\n",
|
|
|
+ " # Per-sample fields looked up in preprocess_df:\n",
|
|
|
+ " # the image file path, the caption, and the id.\n",
|
|
|
+ " fname = preprocess_df.at[i, 'file_path']\n",
|
|
|
+ " caption = preprocess_df.at[i, 'caption']\n",
|
|
|
+ " id = preprocess_df.at[i, 'id']\n",
|
|
|
+ " fname = os.path.join(image_path, fname)\n",
|
|
|
+ "\n",
|
|
|
+ " # Read the raw (still encoded) image file contents; the listed files include both .png and .jpg.\n",
|
|
|
+ " image = readfile(fname)\n",
|
|
|
+ "\n",
|
|
|
+ " # Construct a unique key from the filename.\n",
|
|
|
+ " base_dir = os.path.dirname(fname)\n",
|
|
|
+ " dir_name = os.path.basename(base_dir)\n",
|
|
|
+ " key = os.path.splitext(os.path.basename(fname))[0]\n",
|
|
|
+ " key = f\"{dir_name}_{key}\"\n",
|
|
|
+ "\n",
|
|
|
+ " # Useful check.\n",
|
|
|
+ " assert key not in all_keys, f\"Conflict detected: Key '{key}' already exists.\"\n",
|
|
|
+ " all_keys.add(key)\n",
|
|
|
+ "\n",
|
|
|
+ " # Construct the cls field with the new format.\n",
|
|
|
+ " cls = f\"4 4 1\\n# male_{id} female_{id} person_{id} people_{id}\\n0 1 2 3\" \n",
|
|
|
+ "\n",
|
|
|
+ " # Construct a sample.\n",
|
|
|
+ " xkey = key if True else \"%07d\" % i\n",
|
|
|
+ " sample = {\"__key__\": xkey, \"jpg\": image, \"cls\": cls}\n",
|
|
|
+ " # sample = {\"__key__\": xkey, \"jpg\": image, \"txt\": caption}\n",
|
|
|
+ "\n",
|
|
|
+ " # Write the sample to the sharded tar archives.\n",
|
|
|
+ " sink.write(sample)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": []
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "kernelspec": {
|
|
|
+ "display_name": "groupvit",
|
|
|
+ "language": "python",
|
|
|
+ "name": "python3"
|
|
|
+ },
|
|
|
+ "language_info": {
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 3
|
|
|
+ },
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "nbconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython3",
|
|
|
+ "version": "3.7.12"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 2
|
|
|
+}
|