1 ヶ月前 · 1c9aafb325
--- a/cuhkpedes/cuhkpedes_entity_add.ipynb
+++ b/cuhkpedes/cuhkpedes_entity_add.ipynb
@@ -0,0 +1,257 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# 添加cuhkpedes每句的实体"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 1,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "import os\n",
			
 
				+    "import json\n",
			
 
				+    "import collections\n",
			
 
				+    "import string\n",
			
 
				+    "import pandas as pd\n",
			
 
				+    "import numpy as np\n",
			
 
				+    "import nltk\n",
			
 
				+    "from nltk.tokenize import *"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# 确保你已经下载了 NLTK 的 punkt 数据"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 2,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "punkt tokenizer models are already downloaded.\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "try:\n",
			
 
				+    "    sent_tokenize(\"This is a test sentence.\")\n",
			
 
				+    "    print(\"punkt tokenizer models are already downloaded.\")\n",
			
 
				+    "except LookupError:\n",
			
 
				+    "    print(\"punkt tokenizer models are not downloaded.\")\n",
			
 
				+    "    nltk.download('punkt')"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# 设置数据集路径"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# 获取主目录\n",
			
 
				+    "home_directory = os.path.expanduser('~')\n",
			
 
				+    "dataset_path = os.path.join(home_directory, 'dataset/cross_reid/CUHK-PEDES')\n",
			
 
				+    "class_file = os.path.join(dataset_path, 'class.json')\n",
			
 
				+    "raw_file = os.path.join(dataset_path, 'reid_raw.json')"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# 读取 JSON 文件"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 4,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "with open(class_file, 'r') as file:\n",
			
 
				+    "    class_data = json.load(file)\n",
			
 
				+    "\n",
			
 
				+    "class_df = pd.DataFrame(class_data, columns=['class'])\n",
			
 
				+    "class_df_set = set(class_df['class'])\n",
			
 
				+    "\n",
			
 
				+    "with open(raw_file, 'r') as file:\n",
			
 
				+    "    raw_data = json.load(file)\n",
			
 
				+    "\n",
			
 
				+    "raw_df = pd.DataFrame(raw_data)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# 添加实体\n"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 7,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "   split                                           captions  \\\n",
			
 
				+      "0  train  a pedestrian with dark hair is wearing red and...   \n",
			
 
				+      "1  train  a man wearing a black jacket, black pants, red...   \n",
			
 
				+      "2  train  the man is wearing a black jacket, green jeans...   \n",
			
 
				+      "3  train  he's wearing a black hooded sweatshirt with a ...   \n",
			
 
				+      "4  train  the man is walking.  he is wearing a bright gr...   \n",
			
 
				+      "\n",
			
 
				+      "                      file_path  \\\n",
			
 
				+      "0            CUHK01/0363004.png   \n",
			
 
				+      "1            CUHK01/0363003.png   \n",
			
 
				+      "2            CUHK01/0363001.png   \n",
			
 
				+      "3            CUHK01/0363002.png   \n",
			
 
				+      "4  train_query/p8130_s10935.jpg   \n",
			
 
				+      "\n",
			
 
				+      "                                              entity  \n",
			
 
				+      "0  hair,person,shoes,sneakers,sweatshirt,pedestri...  \n",
			
 
				+      "1         man,jacket,hand,shoes,sneakers,shirt,pants  \n",
			
 
				+      "2  man,jacket,jeans,carrying,backpack,sneakers,pants  \n",
			
 
				+      "3  man,hair,hoodie,carrying,backpack,shoes,sneake...  \n",
			
 
				+      "4             man,shoes,sleeved,vest,shirt,hat,pants  \n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "def judge_noun(word):\n",
			
 
				+    "    if word in class_df_set:\n",
			
 
				+    "        return 1\n",
			
 
				+    "    return 0\n",
			
 
				+    "\n",
			
 
				+    "def add_entity(items):\n",
			
 
				+    "    # 合并所有描述并转换为小写\n",
			
 
				+    "    combined_description = ' '.join(items['captions']).lower()\n",
			
 
				+    "    items['captions'] = combined_description\n",
			
 
				+    "    # 分词\n",
			
 
				+    "    all_words = nltk.word_tokenize(combined_description)\n",
			
 
				+    "    # 包含的实体\n",
			
 
				+    "    valid_list = [judge_noun(word) for word in all_words]\n",
			
 
				+    "    valid = sum(valid_list)\n",
			
 
				+    "\n",
			
 
				+    "    if valid:\n",
			
 
				+    "            valid_words = np.array(all_words)[np.argwhere(valid_list)][:,0].tolist()\n",
			
 
				+    "            valid_words = list(set(valid_words)) ## keep unique entities\n",
			
 
				+    "            items['entity'] = ','.join(valid_words)\n",
			
 
				+    "    \n",
			
 
				+    "    return items.filter(items=['split', 'captions', 'file_path', 'entity'])\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "entity_added_df = raw_df.apply(add_entity, axis=1)\n",
			
 
				+    "\n",
			
 
				+    "print(entity_added_df.head())"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# 保存train_entity_add.csv"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 17,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "                                            captions           file_path  \\\n",
			
 
				+      "0  a pedestrian with dark hair is wearing red and...  CUHK01/0363004.png   \n",
			
 
				+      "\n",
			
 
				+      "                                              entity  \n",
			
 
				+      "0  hair,person,shoes,sneakers,sweatshirt,pedestri...  \n",
			
 
				+      "                                            captions  \\\n",
			
 
				+      "0  a man wearing a blue and white stripe tank top...   \n",
			
 
				+      "\n",
			
 
				+      "                      file_path  \\\n",
			
 
				+      "0  train_query/p8848_s17661.jpg   \n",
			
 
				+      "\n",
			
 
				+      "                                              entity  \n",
			
 
				+      "0  man,striped,tank,neck,pair,shoes,around,headph...  \n",
			
 
				+      "                                            captions           file_path  \\\n",
			
 
				+      "0  the man has short, dark hair and wears khaki p...  CUHK01/0107002.png   \n",
			
 
				+      "\n",
			
 
				+      "                                              entity  \n",
			
 
				+      "0  man,hair,khaki,hoodie,hangs,shoulder,jacket,ba...  \n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "processed_train_data = entity_added_df.loc[entity_added_df['split'] == 'train']\n",
			
 
				+    "processed_test_data = entity_added_df.loc[entity_added_df['split'] == 'test']\n",
			
 
				+    "processed_val_data = entity_added_df.loc[entity_added_df['split'] == 'val']\n",
			
 
				+    "\n",
			
 
				+    "del processed_train_data['split']\n",
			
 
				+    "del processed_test_data['split']\n",
			
 
				+    "del processed_val_data['split']\n",
			
 
				+    "\n",
			
 
				+    "# 重置索引并丢弃原始索引\n",
			
 
				+    "processed_train_data = processed_train_data.reset_index(drop=True)\n",
			
 
				+    "processed_test_data = processed_test_data.reset_index(drop=True)\n",
			
 
				+    "processed_val_data = processed_val_data.reset_index(drop=True)\n",
			
 
				+    "\n",
			
 
				+    "print(processed_train_data.head(1))\n",
			
 
				+    "print(processed_test_data.head(1))\n",
			
 
				+    "print(processed_val_data.head(1))\n",
			
 
				+    "\n",
			
 
				+    "processed_train_data.to_csv(f'{dataset_path}/train_entity.csv', index=False)\n",
			
 
				+    "processed_test_data.to_csv(f'{dataset_path}/test_entity.csv', index=False)\n",
			
 
				+    "processed_val_data.to_csv(f'{dataset_path}/val_entity.csv', index=False)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": []
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "ovsegmentor",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.10.4"
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 2
			
 
				+}
			
--- a/cuhkpedes/cuhkpedes_topk_summarize.ipynb
+++ b/cuhkpedes/cuhkpedes_topk_summarize.ipynb
@@ -9,7 +9,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 15,
			
 
				+   "execution_count": 1,
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -34,7 +34,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 16,
			
 
				+   "execution_count": 2,
			
 
				    "metadata": {},
			
 
				    "outputs": [
			
 
				     {
			
@@ -61,7 +61,7 @@
 
				        "True"
			
 
				       ]
			
 
				      },
			
 
				-     "execution_count": 16,
			
 
				+     "execution_count": 2,
			
 
				      "metadata": {},
			
 
				      "output_type": "execute_result"
			
 
				     }
			
@@ -93,7 +93,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 17,
			
 
				+   "execution_count": 3,
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -113,7 +113,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 18,
			
 
				+   "execution_count": 4,
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -132,7 +132,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 19,
			
 
				+   "execution_count": 5,
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -149,7 +149,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 20,
			
 
				+   "execution_count": 6,
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -158,7 +158,8 @@
 
				     "    'wearing', 'long', 'grey', 'also', 'colored', 'back', 'left', 'right', 'small', 'top', 'front', 'bottom',\n",
			
 
				     "    'long', 'longer', 'longest', 'length', 'side', 'light', 'stripes', 'something', 'tan', 'stripe', 'print',\n",
			
 
				     "    'picture', 'shopping', 'body', 'design', 'cell', 'color', 'object', 'trim', 'pattern', 'street', 'underneath',\n",
			
 
				-    "    'soles', 'beige', 'sidewalk', 'cargo', 'leather', 'outfit', 'walks', 'hem', 'walking', 'style'\n",
			
 
				+    "    'soles', 'beige', 'sidewalk', 'cargo', 'leather', 'outfit', 'walks', 'hem', 'walking', 'style', 'inside',\n",
			
 
				+    "    'wears', 'item', 'holding', 'carring', 'bright', 'short'\n",
			
 
				     "])"
			
 
				    ]
			
 
				   },
			
@@ -171,7 +172,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 21,
			
 
				+   "execution_count": 7,
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -187,7 +188,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 22,
			
 
				+   "execution_count": 8,
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -214,7 +215,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 23,
			
 
				+   "execution_count": 9,
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -236,7 +237,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 24,
			
 
				+   "execution_count": 10,
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -253,7 +254,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 25,
			
 
				+   "execution_count": 11,
			
 
				    "metadata": {},
			
 
				    "outputs": [
			
 
				     {
			
@@ -379,7 +380,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 26,
			
 
				+   "execution_count": 12,
			
 
				    "metadata": {},
			
 
				    "outputs": [
			
 
				     {