| import pandas as pd |
| import torch |
| import numpy as np |
| from os.path import join |
| import matplotlib.pyplot as plt |
| import hydra |
|
|
|
|
class QuadTree(object):
    """Recursive quadtree over the (latitude, longitude) points of a DataFrame.

    A node splits into up to four children while it has remaining depth
    budget (``depth > 0``) and holds at least ``do_split`` rows; leaves keep
    their subset of the data.  Node ids are strings over {0, 1, 2, 3}
    encoding the path from the root, in quadrant order:
    lat-low/lon-low, lat-low/lon-high, lat-high/lon-low, lat-high/lon-high.
    """

    def __init__(self, data, mins=None, maxs=None, id="", depth=3, do_split=1000):
        """Build the (sub)tree rooted at this node.

        Args:
            data: DataFrame with at least "latitude" and "longitude" columns.
            mins: (lat_min, lon_min) of this node's cell; inferred from
                ``data`` at the root when None.
            maxs: (lat_max, lon_max) of this node's cell; inferred from
                ``data`` at the root when None.
            id: path string of this node ("" for the root).  NOTE: shadows
                the builtin ``id``; name kept for caller compatibility.
            depth: remaining number of allowed split levels.
            do_split: minimum number of rows required to split further.
        """
        self.id = id
        self.data = data

        if mins is None:
            mins = data[["latitude", "longitude"]].to_numpy().min(0)
        if maxs is None:
            maxs = data[["latitude", "longitude"]].to_numpy().max(0)

        self.mins = np.asarray(mins)
        self.maxs = np.asarray(maxs)
        self.sizes = self.maxs - self.mins

        self.children = []

        mids = 0.5 * (self.mins + self.maxs)
        xmin, ymin = self.mins
        xmax, ymax = self.maxs
        xmid, ymid = mids

        if (depth > 0) and (len(self.data) >= do_split):
            # Evaluate each comparison once (the original recomputed every
            # comparison twice, once per adjacent quadrant).  The `>=` masks
            # are built explicitly rather than as `~mask` so rows with NaN
            # coordinates fail both sides and are dropped, exactly as before.
            lat_lo = data["latitude"] < mids[0]
            lat_hi = data["latitude"] >= mids[0]
            lon_lo = data["longitude"] < mids[1]
            lon_hi = data["longitude"] >= mids[1]

            # (subset, child mins, child maxs, id suffix), in the original
            # q1..q4 creation order.
            quadrants = [
                (data[lat_lo & lon_lo], [xmin, ymin], [xmid, ymid], "0"),
                (data[lat_lo & lon_hi], [xmin, ymid], [xmid, ymax], "1"),
                (data[lat_hi & lon_lo], [xmid, ymin], [xmax, ymid], "2"),
                (data[lat_hi & lon_hi], [xmid, ymid], [xmax, ymax], "3"),
            ]
            for sub_data, sub_mins, sub_maxs, suffix in quadrants:
                # Empty quadrants produce no child node.
                if sub_data.shape[0] > 0:
                    self.children.append(
                        QuadTree(
                            sub_data,
                            sub_mins,
                            sub_maxs,
                            id + suffix,
                            depth - 1,
                            do_split=do_split,
                        )
                    )

    def unwrap(self):
        """Return {leaf_id: [mins, maxs, data_copy]} for every leaf below this node."""
        if len(self.children) == 0:
            return {self.id: [self.mins, self.maxs, self.data.copy()]}
        d = dict()
        for child in self.children:
            d.update(child.unwrap())
        return d
|
|
|
|
def extract(qt, name_new_column):
    """Flatten a QuadTree into per-cluster boundaries and labelled data.

    Args:
        qt: object exposing ``unwrap() -> {quad_id: [mins, maxs, DataFrame]}``.
        name_new_column: name of the integer cluster-id column added to each
            leaf's DataFrame.

    Returns:
        boundaries: dict {cluster_index: (min_lat, min_lon, max_lat, max_lon,
            mean_lat, mean_lon)} — box corners plus the mean coordinate of
            the points in the cluster.
        data: concatenation of all leaf DataFrames, tagged with the cluster
            index in ``name_new_column``.
        id_to_quad: np.ndarray mapping cluster index -> quadtree path string.
    """
    cluster = qt.unwrap()
    boundaries, data = {}, []
    id_to_quad = np.array(list(cluster.keys()))
    # enumerate() replaces the original zip(np.arange(len(cluster)), ...)
    # pairing; `quad_id` avoids shadowing the builtin `id`.
    for i, (quad_id, vs) in enumerate(cluster.items()):
        (min_lat, min_lon), (max_lat, max_lon), points = vs
        points[name_new_column] = int(i)
        data.append(points)
        boundaries[i] = (
            float(min_lat),
            float(min_lon),
            float(max_lat),
            float(max_lon),
            points["latitude"].mean(),
            points["longitude"].mean(),
        )
    return boundaries, pd.concat(data), id_to_quad
|
|
|
|
def vizu(name_new_column, df_train, boundaries, save_path):
    """Save two diagnostic figures for a quadtree clustering.

    Writes ``<name_new_column>_distrib.png`` (log-scale histogram of images
    per cluster) and ``<name_new_column>_map.png`` (lat/lon scatter colored
    by a random permutation of cluster ids) into ``save_path``.
    """
    n_clusters = len(boundaries)

    # Histogram of how many rows fall into each cluster.
    plt.hist(df_train[name_new_column], bins=n_clusters)
    plt.xlabel("Cluster ID")
    plt.ylabel("Number of images")
    plt.title("Cluster distribution")
    plt.yscale("log")
    plt.savefig(join(save_path, f"{name_new_column}_distrib.png"))
    plt.clf()

    # Scatter map; permuting the color indices decorrelates neighbouring
    # cluster ids so adjacent cells get visually distinct colors.
    lons = df_train["longitude"].to_numpy()
    lats = df_train["latitude"].to_numpy()
    labels = df_train[name_new_column].to_numpy()
    colors = np.random.permutation(n_clusters)[labels]
    plt.scatter(lons, lats, c=colors, cmap="tab20", s=0.1, alpha=0.5)
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.title("Quadtree map")
    plt.savefig(join(save_path, f"{name_new_column}_map.png"))
|
|
|
|
@hydra.main(
    config_path="../../configs/scripts",
    config_name="preprocess",
    version_base=None,
)
def main(cfg):
    """Preprocess the dataset csvs: quadtree clustering plus per-class
    GPS-center tensors and class-hierarchy matrices.

    Reads ``<cfg.data_dir>/osv5m/{train,test}.csv`` and writes all outputs
    under ``cfg.data_dir``.  Uses ``cfg.depth`` / ``cfg.do_split`` for the
    quadtree and ``cfg.overwrite_csv`` to decide whether the input csvs are
    rewritten with the new cluster column.
    """
    data_path = join(cfg.data_dir, "osv5m")
    save_path = cfg.data_dir
    # Name of the cluster-id column added to the data, e.g. "quadtree_10_1000".
    name_new_column = f"quadtree_{cfg.depth}_{cfg.do_split}"

    train_fp = join(data_path, f"train.csv")
    df_train = pd.read_csv(train_fp, low_memory=False)

    # ---- Quadtree over the training coordinates ---------------------------
    qt = QuadTree(df_train, depth=cfg.depth, do_split=cfg.do_split)
    # df_train is rebound here: it becomes the concatenation of the quadtree
    # leaves, carrying the integer cluster id in `name_new_column` (row order
    # follows the leaves, not the original csv).
    boundaries, df_train, id_to_quad = extract(qt, name_new_column)

    vizu(name_new_column, df_train, boundaries, save_path)

    # Per-cluster bounding box and mean coordinate, persisted as csv.
    boundaries = pd.DataFrame.from_dict(
        boundaries,
        orient="index",
        columns=["min_lat", "min_lon", "max_lat", "max_lon", "mean_lat", "mean_lon"],
    )
    boundaries.to_csv(
        join(save_path, f"{name_new_column}.csv"), index_label="cluster_id"
    )

    # ---- Assign each test point to a quadtree cell ------------------------
    test_fp = join(data_path, f"test.csv")
    df_test = pd.read_csv(test_fp)

    # Broadcast every test coordinate against every box edge: each of these
    # is an (n_test, n_clusters) boolean matrix (strict inequalities).
    above_lat = np.expand_dims(df_test["latitude"].to_numpy(), -1) > np.expand_dims(
        boundaries["min_lat"].to_numpy(), 0
    )
    below_lat = np.expand_dims(df_test["latitude"].to_numpy(), -1) < np.expand_dims(
        boundaries["max_lat"].to_numpy(), 0
    )
    above_lon = np.expand_dims(df_test["longitude"].to_numpy(), -1) > np.expand_dims(
        boundaries["min_lon"].to_numpy(), 0
    )
    below_lon = np.expand_dims(df_test["longitude"].to_numpy(), -1) < np.expand_dims(
        boundaries["max_lon"].to_numpy(), 0
    )

    # True where a test point lies strictly inside a cluster's box.
    mask = np.logical_and(
        np.logical_and(above_lat, below_lat), np.logical_and(above_lon, below_lon)
    )

    # NOTE(review): np.argmax of an all-False row is 0, so a test point that
    # falls in no box (exactly on a cell edge, or outside the train extent)
    # is silently assigned cluster 0 — confirm this is acceptable.
    df_test[name_new_column] = np.argmax(mask, axis=1)

    # Cluster centers (mean lat/lon, in degrees as stored in `boundaries`)
    # as an (n_clusters, 2) tensor indexed by cluster id.
    lat = torch.tensor(boundaries["mean_lat"])
    lon = torch.tensor(boundaries["mean_lon"])
    coord = torch.stack([lat, lon], dim=-1)
    torch.save(
        coord, join(save_path, f"index_to_gps_quadtree_{cfg.depth}_{cfg.do_split}.pt")
    )

    # Mapping cluster index -> quadtree path string.
    torch.save(id_to_quad, join(save_path, f"id_to_quad_{cfg.depth}_{cfg.do_split}.pt"))
    if cfg.overwrite_csv:
        # Rewrite the source csvs with the new cluster column included.
        df_train.to_csv(train_fp, index=False)
        df_test.to_csv(test_fp, index=False)

    # ---- Mean coordinate per administrative category ----------------------
    # Re-read train.csv from disk (picks up the overwritten file when
    # overwrite_csv is set); missing values become the literal string "NaN".
    df = pd.read_csv(join(data_path, "train.csv"), low_memory=False).fillna("NaN")
    country_avg = (
        df.groupby("unique_country")[["latitude", "longitude"]].mean().reset_index()
    )
    country_avg.to_csv(
        join(save_path, "country_center.csv"),
        columns=["unique_country", "latitude", "longitude"],
        index=False,
    )
    region_avg = (
        df.groupby(["unique_region"])[["latitude", "longitude"]].mean().reset_index()
    )
    region_avg.to_csv(
        join(save_path, "region_center.csv"),
        columns=["unique_region", "latitude", "longitude"],
        index=False,
    )
    area_avg = (
        df.groupby(["unique_sub-region"])[["latitude", "longitude"]]
        .mean()
        .reset_index()
    )
    area_avg.to_csv(
        join(save_path, "sub-region_center.csv"),
        columns=["unique_sub-region", "latitude", "longitude"],
        index=False,
    )
    city_avg = (
        df.groupby(["unique_city"])[["latitude", "longitude"]].mean().reset_index()
    )
    city_avg.to_csv(
        join(save_path, "city_center.csv"),
        columns=["unique_city", "latitude", "longitude"],
        index=False,
    )

    # ---- index_to_gps_<class>.pt: per-category centers in radians ---------
    for class_name in [
        "unique_country",
        "unique_sub-region",
        "unique_region",
        "unique_city",
    ]:
        # e.g. "unique_country" -> "country_center.csv" written just above.
        csv_file = class_name.split("_")[-1] + "_center.csv"
        df = pd.read_csv(join(save_path, csv_file), low_memory=False)

        # Sorted category vocabulary over the listed splits (train only).
        splits = ["train"]
        categories = sorted(
            pd.concat(
                [
                    pd.read_csv(
                        join(data_path, f"{split}.csv"), low_memory=False
                    )[class_name]
                    for split in splits
                ]
            )
            .fillna("NaN")
            .unique()
            .tolist()
        )

        # The missing-value sentinel is not a real class.
        if "NaN" in categories:
            categories.remove("NaN")

        num_classes = len(categories)

        category_to_index = {category: i for i, category in enumerate(categories)}

        # Row i holds (lat, lon) of category i, converted to radians.
        dictionary = torch.zeros((num_classes, 2))
        for index, row in df.iterrows():
            key = row.iloc[0]
            # NOTE(review): `value` is never used below.
            value = [row.iloc[1], row.iloc[2]]
            if key in categories:
                (
                    dictionary[category_to_index[key], 0],
                    dictionary[category_to_index[key], 1],
                ) = np.radians(row.iloc[1]), np.radians(row.iloc[2])

        output_file = join(save_path, "index_to_gps_" + class_name + ".pt")
        torch.save(dictionary, output_file)

    # ---- City -> {country, region, sub-region} membership matrices --------
    train = pd.read_csv(join(data_path, "train.csv"), low_memory=False).fillna(
        "NaN"
    )

    # One representative row per city; presumably each city belongs to a
    # single country/region/sub-region, so the pivots below yield one-hot
    # rows — TODO confirm against the data.
    u = train.groupby("unique_city").sample(n=1)

    # NOTE(review): this country/region pivot pair is recomputed verbatim
    # further down; only the later results are actually saved.
    country_df = (
        u.pivot(index="unique_city", columns="unique_country", values="unique_city")
        .notna()
        .astype(int)
        .fillna(0)
    )
    country_to_idx = {
        category: i for i, category in enumerate(list(country_df.columns))
    }
    city_country_matrix = torch.tensor(country_df.values) / 1.0

    region_df = (
        u.pivot(index="unique_city", columns="unique_region", values="unique_city")
        .notna()
        .astype(int)
        .fillna(0)
    )
    region_to_idx = {category: i for i, category in enumerate(list(region_df.columns))}
    city_region_matrix = torch.tensor(region_df.values) / 1.0

    # city x country indicator matrix (the `/ 1.0` casts to float).
    country_df = (
        u.pivot(index="unique_city", columns="unique_country", values="unique_city")
        .notna()
        .astype(int)
        .fillna(0)
    )
    country_to_idx = {
        category: i for i, category in enumerate(list(country_df.columns))
    }
    city_country_matrix = torch.tensor(country_df.values) / 1.0

    output_file = join(save_path, "city_to_country.pt")
    torch.save(city_country_matrix, output_file)

    output_file = join(save_path, "country_to_idx.pt")
    torch.save(country_to_idx, output_file)

    # city x region indicator matrix.
    region_df = (
        u.pivot(index="unique_city", columns="unique_region", values="unique_city")
        .notna()
        .astype(int)
        .fillna(0)
    )
    region_to_idx = {category: i for i, category in enumerate(list(region_df.columns))}
    city_region_matrix = torch.tensor(region_df.values) / 1.0

    output_file = join(save_path, "city_to_region.pt")
    torch.save(city_region_matrix, output_file)

    output_file = join(save_path, "region_to_idx.pt")
    torch.save(region_to_idx, output_file)

    # city x sub-region indicator matrix.
    area_df = (
        u.pivot(index="unique_city", columns="unique_sub-region", values="unique_city")
        .notna()
        .astype(int)
        .fillna(0)
    )
    area_to_idx = {category: i for i, category in enumerate(list(area_df.columns))}
    city_area_matrix = torch.tensor(area_df.values) / 1.0

    output_file = join(save_path, "city_to_area.pt")
    torch.save(city_area_matrix, output_file)

    output_file = join(save_path, "area_to_idx.pt")
    torch.save(area_to_idx, output_file)

    # ---- Coarse-to-fine quadtree hierarchy matrices -----------------------
    # Reload the cluster-index -> path-string array saved earlier.
    gt = torch.load(join(save_path, f"id_to_quad_{cfg.depth}_{cfg.do_split}.pt"))
    matrixes = []
    dicts = []
    for i in range(1, cfg.depth):
        # Ancestor id of each leaf at level (cfg.depth - i): truncate the
        # path string; ids already shorter than that level are kept as-is.
        l = [s[: cfg.depth - i] if len(s) >= cfg.depth + 1 - i else s for s in gt]

        # Unique ancestor ids at this level (set iteration order, so the
        # column ordering is not deterministic across runs).
        h = list(set(l))

        # Ancestor id -> column index at this level.
        h_dict = {value: index for index, value in enumerate(h)}
        dicts.append(h_dict)

        # One-hot leaf-row -> ancestor-column matrix.
        matrix = torch.zeros((len(gt), len(h)))

        # NOTE(review): the loop index reuses the name `h`, shadowing the
        # ancestor list above — safe only because h_dict was already built.
        for h in range(len(gt)):
            j = h_dict[l[h]]
            matrix[h, j] = 1
        matrixes.append(matrix)

    output_file = join(save_path, "quadtree_matrixes.pt")
    torch.save(matrixes, output_file)

    output_file = join(save_path, "quadtree_dicts.pt")
    torch.save(dicts, output_file)
|
|
|
|
if __name__ == "__main__":
    # Hydra parses the CLI overrides and injects the composed cfg into main().
    main()
|
|