Create Dataset¶
Download Global Streetscapes Metadata from NUS-UAL/global-streetscapes¶
In [ ]:
from huggingface_hub import hf_hub_download
import pandas as pd
The following files were chosen because they seemed the most useful (admittedly a subjective pick).
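If you would like to browse everything the repo ships before settling on a subset, the Hub API can list the repo contents. A minimal sketch using huggingface_hub.list_repo_files, which needs nothing beyond the dataset ID:

In [ ]:
from huggingface_hub import list_repo_files

# Print every file under data/ in the dataset repo
for f in list_repo_files("NUS-UAL/global-streetscapes", repo_type="dataset"):
    if f.startswith("data/"):
        print(f)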
In [ ]:
# Define dataset ID and target filenames
dataset_id = "NUS-UAL/global-streetscapes"
files = [
    "metadata_common_attributes.csv",
    "metadata_mly1.csv",
    "metadata_mly2.csv",
    "osm.csv",
    "perception.csv",
    "places365.csv",
    "season.csv",
    "simplemaps.csv",
    "contextual.csv",
]

# Download each file from the "data/" folder in the dataset repo.
# local_dir="." because the filename already carries the data/ prefix,
# so each file lands at ./data/<fname> -- the paths the later cells read.
# (local_dir_use_symlinks is deprecated in recent huggingface_hub versions
# and no longer needed.)
local_paths = []
for fname in files:
    path = hf_hub_download(
        repo_id=dataset_id,
        repo_type="dataset",
        filename=f"data/{fname}",
        local_dir=".",
    )
    local_paths.append(path)
    print(f"Downloaded {fname} -> {path}")

# Optional: load one of the CSVs into pandas to take a look
df = pd.read_csv(local_paths[0])
df.head()
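If you would rather fetch every CSV under data/ in one call instead of listing filenames by hand, snapshot_download with an allow_patterns filter is an alternative. A sketch, assuming all files of interest match data/*.csv:

In [ ]:
from huggingface_hub import snapshot_download

# Download everything matching data/*.csv in one call; files land under ./data/
snapshot_download(
    repo_id="NUS-UAL/global-streetscapes",
    repo_type="dataset",
    allow_patterns=["data/*.csv"],
    local_dir=".",
)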
Combine the metadata into one file¶
In [ ]:
# Base CSV: keep only rows where source == 'Mapillary'
metadata_common = pd.read_csv(
    "data/metadata_common_attributes.csv",
    usecols=['uuid', 'lat', 'lon', 'heading', 'orig_id', 'source'],
)
metadata_common = metadata_common[metadata_common['source'] == 'Mapillary']

# Helper: left-merge another CSV onto the base frame by uuid
def merge_csv(df_base, file_path, usecols):
    df_add = pd.read_csv(file_path, usecols=usecols)
    return df_base.merge(df_add, on='uuid', how='left')

# Sequentially merge all additional files
metadata_common = merge_csv(metadata_common, "data/metadata_mly1.csv", ['uuid', 'mly_quality_score'])
metadata_common = merge_csv(metadata_common, "data/metadata_mly2.csv", ['uuid', 'mly_computed_compass_angle'])
metadata_common = merge_csv(metadata_common, "data/osm.csv", ['uuid', 'type_highway'])
metadata_common = merge_csv(metadata_common, "data/perception.csv", None)  # None = all columns
metadata_common = merge_csv(metadata_common, "data/places365.csv", ['uuid', 'place'])
metadata_common = merge_csv(metadata_common, "data/season.csv", ['uuid', 'season'])
metadata_common = merge_csv(metadata_common, "data/simplemaps.csv", ['uuid', 'city_ascii', 'city_id', 'iso3', 'admin_name'])
metadata_common = merge_csv(metadata_common, "data/contextual.csv", ['uuid', 'platform', 'view_direction', 'quality'])

# Save final result
metadata_common.to_csv("data/joined_metadata.csv", index=False)
print("Final dataset shape:", metadata_common.shape)
print(metadata_common.head())
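A left merge silently duplicates base rows whenever the right-hand file repeats a uuid. If you want to guard against that, pandas can validate merge cardinality. A hypothetical stricter variant of the helper above, assuming each attribute file has at most one row per uuid:

In [ ]:
# Stricter helper: raises pandas.errors.MergeError if 'uuid' is not unique in df_add
def merge_csv_checked(df_base, file_path, usecols):
    df_add = pd.read_csv(file_path, usecols=usecols)
    return df_base.merge(df_add, on='uuid', how='left', validate='many_to_one')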
Build GeoDataFrame¶
In [ ]:
# Read file
data = pd.read_csv("data/joined_metadata.csv")
print(data.head())
In [ ]:
import geopandas as gpd

# Create a GeoDataFrame with point geometry from the lon/lat columns (WGS 84)
gdf = gpd.GeoDataFrame(
    data, geometry=gpd.points_from_xy(data.lon, data.lat), crs="EPSG:4326"
)
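Before writing anything to the database, a quick sanity check confirms the CRS is set and the coordinates cover a plausible extent (optional; it just prints a few summaries):

In [ ]:
print(gdf.crs)                           # expect EPSG:4326
print(gdf.total_bounds)                  # [minx, miny, maxx, maxy] in degrees
print(gdf[['lat', 'lon']].isna().sum())  # rows with missing coordinates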
Connect to PostgreSQL¶
In [ ]:
from getpass import getpass
from urllib.parse import quote_plus
from sqlalchemy import create_engine

# Database connection info -- change credentials if necessary
host = "localhost"
database = "gis"
user = "moritz"
port = "25432"
password = getpass("Enter your password: ")

# URL-encode the password to handle special characters
encoded_password = quote_plus(password)

# Build the connection string, including the port
connection_string = f"postgresql://{user}:{encoded_password}@{host}:{port}/{database}"
engine = create_engine(connection_string)

# If you want to use %sql magic in Jupyter
%reload_ext sql
%sql $connection_string
%config SqlMagic.style = 'DEFAULT'
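It can help to confirm that the connection works and PostGIS is available before loading data. A minimal smoke test, assuming the PostGIS extension is enabled in the target database:

In [ ]:
from sqlalchemy import text

# Query the PostGIS version through the SQLAlchemy engine
with engine.connect() as conn:
    print(conn.execute(text("SELECT postgis_version();")).scalar())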
In [ ]:
# Write the GeoDataFrame to PostGIS (replaces the table if it already exists)
gdf.to_postgis('global_streetscapes', engine, if_exists='replace', index=False)
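For a dataset of this size the write can take a while; to_postgis accepts a chunksize argument that inserts rows in batches, which keeps memory usage bounded (the batch size of 10,000 below is an arbitrary example value):

In [ ]:
# Same write, but in batches of 10,000 rows
gdf.to_postgis('global_streetscapes', engine, if_exists='replace', index=False, chunksize=10000)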
In [ ]:
%%sql
-- Create a spatial index (the %%sql magic must be the first line of the cell)
DROP INDEX IF EXISTS idx_global_streetscapes_spgist_geometry;
CREATE INDEX idx_global_streetscapes_spgist_geometry ON global_streetscapes USING SPGIST (geometry);
In [ ]:
%%sql
SELECT * FROM global_streetscapes LIMIT 1;
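With the spatial index in place, bounding-box queries like the one below can make use of it. An illustrative query; the envelope coordinates are made-up example values (roughly the Singapore area), not taken from the dataset:

In [ ]:
%%sql
-- Points inside an example bounding box (xmin, ymin, xmax, ymax in EPSG:4326)
SELECT uuid, lat, lon, city_ascii
FROM global_streetscapes
WHERE ST_Intersects(
    geometry,
    ST_MakeEnvelope(103.6, 1.2, 104.1, 1.5, 4326)
)
LIMIT 10;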