Global spatial outlier analysis (YFCC100m)

Summary

This notebook contains the code to generate Figure 2, which illustrates several ways to quantify the number of outliers in the YFCC100m dataset.

Preparations

Import dependencies

In [27]:
import os, sys
import psycopg2 # Postgres API
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from pathlib import Path
from IPython.display import clear_output, display, HTML, Markdown
from scipy.interpolate import interp1d
In [28]:
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
    sys.path.append(module_path)
from modules import tools, preparations
In [29]:
# define path to output directory (figures etc.)
OUTPUT = Path.cwd().parents[0] / "out" / 'Figure2_data'
In [30]:
OUTPUT.mkdir(exist_ok=True)

Load keys from .env

In [31]:
import os
from dotenv import load_dotenv

dotenv_path = Path.cwd().parents[0] / '.env'
load_dotenv(dotenv_path)
Out[31]:
True
In [32]:
db_user = "postgres"
db_pass = os.getenv('POSTGRES_PASSWORD')
crypt_key = os.getenv("CRYPT_KEY")
read_only_user_key = os.getenv("USER_KEY")
In [33]:
db_connection_hlldb = psycopg2.connect(
        host="hlldb",
        port="5432",
        dbname="hlldb",
        user=db_user,
        password=db_pass
)
db_connection_rawdb = psycopg2.connect(
        host="rawdb",
        port="5432",
        dbname="rawdb",
        user=db_user,
        password=db_pass
)

Test connection:

In [34]:
db_query = """
    SELECT 1;
"""
# create pandas DataFrame from database data
df = pd.read_sql_query(db_query, db_connection_hlldb)
display(df.head())
?column?
0 1

Simplify query execution with a small helper class:

In [35]:
db_conn_hlldb = tools.DbConn(db_connection_hlldb)
db_conn_hlldb.query("SELECT 1;")
db_conn_rawdb = tools.DbConn(db_connection_rawdb)
db_conn_rawdb.query("SELECT 1;")
Out[35]:
?column?
0 1

If we only want to execute queries without returning results:

In [36]:
db_cursor_hlldb = db_connection_hlldb.cursor()
db_cursor_hlldb.execute("SELECT 1;")
db_cursor_hlldb.fetchone()
db_cursor_rawdb = db_connection_rawdb.cursor()
db_cursor_rawdb.execute("SELECT 1;")
db_cursor_rawdb.fetchone()
Out[36]:
(1,)

In case of an SQL error, roll back manually with:

In [13]:
db_connection_hlldb.rollback()
db_connection_rawdb.rollback()

Prepare schemas and the connection from rawdb to hlldb

In [63]:
db_sql = f'''CREATE SCHEMA IF NOT EXISTS mviews;
ALTER DATABASE rawdb
SET search_path = "$user",
                  social,
                  spatial,
                  temporal,
                  topical,
                  interlinkage,
                  extensions,
                  mviews;
CREATE EXTENSION IF NOT EXISTS pgcrypto SCHEMA extensions;
'''
db_cursor_rawdb.execute(db_sql)
print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
CREATE EXTENSION
In [64]:
db_sql = f'''/* Produce pseudonymized hash of input id with skey
 * - using skey as seed value
 * - sha256 cryptographic hash function
 * - encode in base64 to reduce length of hash
 * - remove trailing '=' from base64 string
 * - return as text
 */
CREATE OR REPLACE FUNCTION 
extensions.crypt_hash (id text, skey text)
RETURNS text
AS $$
    SELECT 
        RTRIM(
            ENCODE(
                HMAC(
                    id::bytea,
                    skey::bytea,
                    'sha256'), 
                'base64'),
            '=')
$$
LANGUAGE SQL
STRICT;
'''
db_cursor_rawdb.execute(db_sql)
print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
CREATE FUNCTION
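
For illustration, a minimal Python sketch of the same pseudonymization step (standard library only, hypothetical input values): HMAC-SHA256 keyed with the secret, base64-encoded, trailing '=' stripped.

In [ ]:
import base64
import hashlib
import hmac

def crypt_hash(id_text: str, skey: str) -> str:
    """Pseudonymize an id: HMAC-SHA256 keyed with skey, base64, '=' stripped."""
    digest = hmac.new(skey.encode(), id_text.encode(), hashlib.sha256).digest()
    return base64.b64encode(digest).decode().rstrip("=")

# hypothetical example values
crypt_hash("12345@N00", "example-secret-key")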

Select from RAW, Group by GeoHash Coordinates

In [65]:
for geohash_precision in range(8, 11):
    db_sql = f'''
    DROP MATERIALIZED VIEW 
    mviews.spatiallatlng_raw_geohash_{geohash_precision:02d};
    '''
    db_cursor_rawdb.execute(db_sql)
    print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
DROP MATERIALIZED VIEW
DROP MATERIALIZED VIEW
DROP MATERIALIZED VIEW

The following queries take quite some time:

In [73]:
%%time
for geohash_precision in range(8, 11):
    db_sql = f'''
    CREATE MATERIALIZED VIEW mviews.spatiallatlng_raw_geohash_{geohash_precision:02d} AS
    SELECT  extensions.crypt_hash(t1.post_guid, '{crypt_key}') as "post_guid",
            ST_Y(ST_PointFromGeoHash(ST_GeoHash(t1.post_latlng, {geohash_precision}), {geohash_precision})) As "latitude", 
            ST_X(ST_PointFromGeoHash(ST_GeoHash(t1.post_latlng, {geohash_precision}), {geohash_precision})) As "longitude", 
            extensions.crypt_hash(t1.user_guid, '{crypt_key}') as "user_guid",
            to_char(t1.post_create_date, 'yyyy-MM-dd') as "post_create_date",
            t1.post_geoaccuracy
    FROM   topical.post t1
    WHERE t1.post_geoaccuracy IN ('place', 'latlng', 'city');
    '''
    db_cursor_rawdb.execute(db_sql)
    print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
SELECT 47843599
SELECT 47843599
SELECT 47843599
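
For reference, a pure-Python sketch of the coordinate generalization applied by ST_GeoHash() and ST_PointFromGeoHash() above: each point is encoded to a GeoHash of the given precision and then replaced by the center of that GeoHash cell, so all coordinates within one cell collapse onto a single representative point. The example coordinates below are hypothetical.

In [ ]:
BASE32 = "0123456789bcdefghjkmnpqrstuvwxyz"

def geohash_encode(lat: float, lng: float, precision: int) -> str:
    """Encode lat/lng into a GeoHash string of the given character precision."""
    lat_lo, lat_hi = -90.0, 90.0
    lng_lo, lng_hi = -180.0, 180.0
    bits, char, even = 0, 0, True
    geohash = []
    while len(geohash) < precision:
        if even:  # even bits refine longitude
            mid = (lng_lo + lng_hi) / 2
            if lng >= mid:
                char = (char << 1) | 1
                lng_lo = mid
            else:
                char = char << 1
                lng_hi = mid
        else:  # odd bits refine latitude
            mid = (lat_lo + lat_hi) / 2
            if lat >= mid:
                char = (char << 1) | 1
                lat_lo = mid
            else:
                char = char << 1
                lat_hi = mid
        even = not even
        bits += 1
        if bits == 5:  # 5 bits per base32 character
            geohash.append(BASE32[char])
            bits, char = 0, 0
    return "".join(geohash)

def geohash_cell_center(geohash: str) -> tuple:
    """Decode a GeoHash to the center (lat, lng) of its cell."""
    lat_lo, lat_hi = -90.0, 90.0
    lng_lo, lng_hi = -180.0, 180.0
    even = True
    for c in geohash:
        val = BASE32.index(c)
        for bit in (16, 8, 4, 2, 1):
            if even:
                mid = (lng_lo + lng_hi) / 2
                lng_lo, lng_hi = (mid, lng_hi) if val & bit else (lng_lo, mid)
            else:
                mid = (lat_lo + lat_hi) / 2
                lat_lo, lat_hi = (mid, lat_hi) if val & bit else (lat_lo, mid)
            even = not even
    return ((lat_lo + lat_hi) / 2, (lng_lo + lng_hi) / 2)

# two nearby points fall into the same precision-5 cell and share one center
print(geohash_cell_center(geohash_encode(51.0504, 13.7373, 5)))
print(geohash_cell_center(geohash_encode(51.0511, 13.7401, 5)))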

Connect hlldb to rawdb

On rawdb, create a read-only user:

In [87]:
db_sql = f'''
-- cleanup
DROP SERVER IF EXISTS lbsnraw CASCADE;
DROP OWNED BY lbsn_reader;
DROP USER IF EXISTS lbsn_reader;

CREATE USER lbsn_reader WITH
    LOGIN
    INHERIT
    PASSWORD '{read_only_user_key}';

GRANT CONNECT ON DATABASE rawdb TO lbsn_reader;
GRANT USAGE ON SCHEMA mviews TO lbsn_reader;
GRANT SELECT ON ALL TABLES IN SCHEMA mviews TO lbsn_reader;
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA mviews TO lbsn_reader;
'''
db_cursor_rawdb.execute(db_sql)
print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
GRANT

On hlldb, enable the fdw extension and create the connection:

In [97]:
db_sql = f'''
CREATE EXTENSION IF NOT EXISTS postgres_fdw SCHEMA extensions;
CREATE SERVER IF NOT EXISTS lbsnraw FOREIGN DATA WRAPPER postgres_fdw OPTIONS (
    host 'rawdb',
    dbname 'rawdb',
    port '5432',
    keepalives '1',
    keepalives_idle '30',
    keepalives_interval '10',
    keepalives_count '5',
    fetch_size '500000');
DROP USER MAPPING IF EXISTS for postgres SERVER lbsnraw;
CREATE USER MAPPING for postgres
SERVER lbsnraw 
OPTIONS (user 'lbsn_reader', password '{read_only_user_key}');
'''
db_cursor_hlldb.execute(db_sql)
print(db_cursor_hlldb.statusmessage)
# commit changes to db
db_connection_hlldb.commit()
CREATE USER MAPPING

Import to hlldb

In [102]:
table_list = ','.join(
    [f'spatiallatlng_raw_geohash_{geohash_precision:02d}' for geohash_precision in range(1, 11)])
db_sql = f'''
DROP FOREIGN TABLE {table_list};
IMPORT FOREIGN SCHEMA mviews 
LIMIT TO (
    {table_list})
FROM SERVER lbsnraw INTO extensions;
'''
db_cursor_hlldb.execute(db_sql)
print(db_cursor_hlldb.statusmessage)
# commit changes to db
db_connection_hlldb.commit()
IMPORT FOREIGN SCHEMA

hlldb preparations

Prepare structure for hll aggregate data per coordinate and GeoHash Level

Make sure output data doesn't exist already:

In [38]:
for geohash_precision in range(8, 11):
    db_sql = f'''
    TRUNCATE spatial.latlng_{geohash_precision:02d};
    '''
    db_cursor_hlldb.execute(db_sql)
    print(db_cursor_hlldb.statusmessage)
# commit changes to db
db_connection_hlldb.commit()
TRUNCATE TABLE
TRUNCATE TABLE
TRUNCATE TABLE

Create structure:

In [40]:
for geohash_precision in range(8, 11):
    db_sql = f'''
    CREATE TABLE IF NOT EXISTS spatial.latlng_{geohash_precision:02d} (
      latitude float,
      longitude float,
      PRIMARY KEY (latitude, longitude),
      latlng_geom geometry(Point, 4326) NOT NULL)
    INHERITS (
      social.user_hll, -- e.g. number of users/latlng (=UPL)
      topical.post_hll, -- e.g. number of posts/latlng
      temporal.date_hll -- e.g. number of userdays/latlng (=PUD)
    );
    '''
    db_cursor_hlldb.execute(db_sql)
    print(db_cursor_hlldb.statusmessage)
# commit changes to db
db_connection_hlldb.commit()
CREATE TABLE
CREATE TABLE
CREATE TABLE

Generate aggregate data (hlldb) from rawdb

In [ ]:
for geohash_precision in range(8, 11):
    db_sql = f'''
    INSERT INTO spatial.latlng_{geohash_precision:02d}(latitude, longitude, user_hll, post_hll, date_hll, latlng_geom)
    SELECT  latitude,
            longitude,
            hll_add_agg(hll_hash_text(user_guid)) as user_hll,
            hll_add_agg(hll_hash_text(post_guid)) as post_hll,
            hll_add_agg(hll_hash_text(user_guid || post_create_date)) as date_hll,
            ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) as latlng_geom
    FROM extensions.spatiallatlng_raw_geohash_{geohash_precision:02d}
    GROUP BY latitude, longitude;
    '''
    db_cursor_hlldb.execute(db_sql)
    print(db_cursor_hlldb.statusmessage)
db_connection_hlldb.commit()
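
Conceptually, the INSERT above groups posts by their generalized coordinate and stores, per coordinate, three HLL sketches instead of the individual (already pseudonymized) ids. A minimal pandas sketch with toy data (column names matching the materialized view) of the exact-count equivalent, to clarify what each hll column estimates:

In [ ]:
import pandas as pd

# toy stand-in for extensions.spatiallatlng_raw_geohash_NN
posts = pd.DataFrame({
    "latitude":         [51.04, 51.04, 48.14],
    "longitude":        [13.73, 13.73, 11.57],
    "user_guid":        ["u1", "u2", "u1"],
    "post_guid":        ["p1", "p2", "p3"],
    "post_create_date": ["2012-05-01", "2012-05-01", "2012-06-03"],
})
posts["userday"] = posts["user_guid"] + posts["post_create_date"]

exact_counts = posts.groupby(["latitude", "longitude"]).agg(
    usercount=("user_guid", "nunique"),   # ~ hll_cardinality(user_hll)
    postcount=("post_guid", "nunique"),   # ~ hll_cardinality(post_hll)
    userdays=("userday", "nunique"),      # ~ hll_cardinality(date_hll)
)
exact_counts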

Collect statistics data

Get usercount for distinct coordinate statistics

In [18]:
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT 
            hll_cardinality(user_hll)::int as user_hll
        FROM spatial.latlng_{geohash_precision:02d}
        ''')
    print(
        f"Queried {len(df_tmp)} distinct coordinates with hll "
        f"aggregates (usercount) for GeoHash precision {geohash_precision}")
    df_tmp.to_csv(
        Path('Figure2_data') / f'geohash{geohash_precision:02d}_spatial_latlng_cardinality.csv',
        mode='w', index=False, header=True)
Queried 32 distinct coordinates with hll aggregates for GeoHash precision 1
Queried 739 distinct coordinates with hll aggregates for GeoHash precision 2
Queried 8173 distinct coordinates with hll aggregates for GeoHash precision 3
Queried 73517 distinct coordinates with hll aggregates for GeoHash precision 4
Queried 451949 distinct coordinates with hll aggregates for GeoHash precision 5
Queried 1668277 distinct coordinates with hll aggregates for GeoHash precision 6
Queried 4179536 distinct coordinates with hll aggregates for GeoHash precision 7
Queried 8164054 distinct coordinates with hll aggregates for GeoHash precision 8
Queried 11514949 distinct coordinates with hll aggregates for GeoHash precision 9
Queried 12517634 distinct coordinates with hll aggregates for GeoHash precision 10

Get total statistics

Union of all hll sets for postcount, userdays and usercount.

In [14]:
file_path = Path('Figure2_data') / 'total-statistics.csv'
if file_path.exists():
    # fast read from intermediate data
    df_total = pd.read_csv(file_path, index_col=0)
else:
    # query from db
    db_sql = f'''
    SELECT hll_cardinality(hll_union_agg(post_hll))::int as postcount, 
           hll_cardinality(hll_union_agg(date_hll))::int as userdays,
           hll_cardinality(hll_union_agg(user_hll))::int as usercount
    FROM spatial.latlng_01
    '''
    df_total = db_conn_hlldb.query(db_sql)
    # store intermediate data
    df_total.to_csv(
        file_path, mode='w', 
        index=True, header=True)
# show results
df_total.head()
Out[14]:
postcount userdays usercount
0 48440022 5970374 211078

Get the total number of distinct coordinates from the raw database (without GeoHash aggregation):

In [13]:
file_path = Path('Figure2_data') / 'distinct-coordinates-yfcc.csv'
if file_path.exists():
    # fast read from intermediate data
    df_distinct_coords = pd.read_csv(file_path, index_col=0)
else:
    # query from db
    db_sql = f'''
    SELECT COUNT(DISTINCT(ST_Y(post_latlng), ST_X(post_latlng)))
    FROM topical.post
    WHERE post_geoaccuracy IN ('place', 'latlng', 'city');
    '''
    df_distinct_coords = db_conn_rawdb.query(db_sql)
    # store intermediate data
    df_distinct_coords.to_csv(
        file_path, mode='w', 
        index=True, header=True)
df_distinct_coords.head()
Out[13]:
count
0 12692723

Get share of user data (partially) hidden in the crowd

For usercount

In [15]:
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT 
            {geohash_precision:02d} as geohash,
            hll_cardinality(hll_union_agg(a.user_hll))::int as sum_usercount
        FROM (SELECT 
                  s.user_hll as user_hll
              FROM spatial.latlng_{geohash_precision:02d} s
              WHERE hll_cardinality(s.user_hll)::int = 1) a
        ''')
    display(df_tmp.head())
    df_tmp['sum_usercount'].fillna(0, inplace=True)
    print(
        f"At GeoHash precision {geohash_precision}, \n"
        f"{df_tmp['sum_usercount'][0]} of distinct users are "
        f"found at spatial outlier locations "
        f"(of {df_total['usercount'][0]} total users, "
        f"{df_tmp['sum_usercount'][0]/(df_total['usercount'][0]/100):.0f}%).")
    mode = 'a'
    header = False
    if geohash_precision == 1:
        mode = 'w' 
        header = True
    df_tmp.to_csv(
        Path('Figure2_data') / f'outlier-stats-geohash-user.csv', 
        mode=mode, index=False, header=header)
geohash sum_usercount
0 1 None
At GeoHash precision 1, 
0 of distinct users are found at spatial outlier locations (of 211078 total users, 0%).
geohash sum_usercount
0 2 124
At GeoHash precision 2, 
124 of distinct users are found at spatial outlier locations (of 211078 total users, 0%).
geohash sum_usercount
0 3 1256
At GeoHash precision 3, 
1256 of distinct users are found at spatial outlier locations (of 211078 total users, 1%).
geohash sum_usercount
0 4 9483
At GeoHash precision 4, 
9483 of distinct users are found at spatial outlier locations (of 211078 total users, 4%).
geohash sum_usercount
0 5 50358
At GeoHash precision 5, 
50358 of distinct users are found at spatial outlier locations (of 211078 total users, 24%).
geohash sum_usercount
0 6 112175
At GeoHash precision 6, 
112175 of distinct users are found at spatial outlier locations (of 211078 total users, 53%).
geohash sum_usercount
0 7 158853
At GeoHash precision 7, 
158853 of distinct users are found at spatial outlier locations (of 211078 total users, 75%).
geohash sum_usercount
0 8 182825
At GeoHash precision 8, 
182825 of distinct users are found at spatial outlier locations (of 211078 total users, 87%).
geohash sum_usercount
0 9 195680
At GeoHash precision 9, 
195680 of distinct users are found at spatial outlier locations (of 211078 total users, 93%).
geohash sum_usercount
0 10 199581
At GeoHash precision 10, 
199581 of distinct users are found at spatial outlier locations (of 211078 total users, 95%).

For postcount

In [16]:
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT 
            {geohash_precision:02d} as geohash,
            hll_cardinality(hll_union_agg(a.post_hll))::int as sum_postcount
        FROM (SELECT 
                  s.post_hll as post_hll
              FROM spatial.latlng_{geohash_precision:02d} s
              WHERE hll_cardinality(s.post_hll)::int = 1) a
        ''')
    display(df_tmp.head())
    df_tmp['sum_postcount'].fillna(0, inplace=True)
    print(
        f"At GeoHash precision {geohash_precision}, \n"
        f"{df_tmp['sum_postcount'][0]} of distinct posts are "
        f"found at spatial outlier locations "
        f"(of {df_total['postcount'][0]} total posts, "
        f"{df_tmp['sum_postcount'][0]/(df_total['postcount'][0]/100):.0f}%).")
    mode = 'a'
    header = False
    if geohash_precision == 1:
        mode = 'w' 
        header = True
    df_tmp.to_csv(
        Path('Figure2_data') / f'outlier-stats-geohash-posts.csv', 
        mode=mode, index=False, header=header)
geohash sum_postcount
0 1 None
At GeoHash precision 1, 
0 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 0%).
geohash sum_postcount
0 2 96
At GeoHash precision 2, 
96 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 0%).
geohash sum_postcount
0 3 1141
At GeoHash precision 3, 
1141 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 0%).
geohash sum_postcount
0 4 12067
At GeoHash precision 4, 
12067 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 0%).
geohash sum_postcount
0 5 109477
At GeoHash precision 5, 
109477 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 0%).
geohash sum_postcount
0 6 588075
At GeoHash precision 6, 
588075 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 1%).
geohash sum_postcount
0 7 1927963
At GeoHash precision 7, 
1927963 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 4%).
geohash sum_postcount
0 8 4890672
At GeoHash precision 8, 
4890672 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 10%).
geohash sum_postcount
0 9 8444880
At GeoHash precision 9, 
8444880 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 17%).
geohash sum_postcount
0 10 9689164
At GeoHash precision 10, 
9689164 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 20%).

For userdays

In [17]:
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT 
            {geohash_precision:02d} as geohash,
            hll_cardinality(hll_union_agg(a.date_hll))::int as sum_userdays
        FROM (SELECT 
                  s.date_hll as date_hll
              FROM spatial.latlng_{geohash_precision:02d} s
              WHERE hll_cardinality(s.date_hll)::int = 1) a
        ''')
    display(df_tmp.head())
    df_tmp['sum_userdays'].fillna(0, inplace=True)
    print(
        f"At GeoHash precision {geohash_precision}, \n"
        f"{df_tmp['sum_userdays'][0]} of distinct userdays are "
        f"found at spatial outlier locations "
        f"(of {df_total['userdays'][0]} total userdays, "
        f"{df_tmp['sum_userdays'][0]/(df_total['userdays'][0]/100):.0f}%).")
    mode = 'a'
    header = False
    if geohash_precision == 1:
        mode = 'w' 
        header = True
    df_tmp.to_csv(
        Path('Figure2_data') / f'outlier-stats-geohash-userdays.csv', 
        mode=mode, index=False, header=header)
geohash sum_userdays
0 1 None
At GeoHash precision 1, 
0 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 0%).
geohash sum_userdays
0 2 111
At GeoHash precision 2, 
111 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 0%).
geohash sum_userdays
0 3 1495
At GeoHash precision 3, 
1495 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 0%).
geohash sum_userdays
0 4 15443
At GeoHash precision 4, 
15443 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 0%).
geohash sum_userdays
0 5 130393
At GeoHash precision 5, 
130393 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 2%).
geohash sum_userdays
0 6 593232
At GeoHash precision 6, 
593232 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 10%).
geohash sum_userdays
0 7 1495497
At GeoHash precision 7, 
1495497 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 25%).
geohash sum_userdays
0 8 2815392
At GeoHash precision 8, 
2815392 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 47%).
geohash sum_userdays
0 9 3667371
At GeoHash precision 9, 
3667371 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 61%).
geohash sum_userdays
0 10 3818572
At GeoHash precision 10, 
3818572 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 64%).

Visualization of results

Correlation between GeoHash Accuracy and Number of distinct locations

In [18]:
home_path = Path.cwd() / 'Figure2_data'
all_files = home_path.glob('*_spatial_latlng_cardinality.csv')

dtypes = {'user_hll': int}

li = []
for filename in sorted(all_files):
    df = pd.read_csv(
        filename, index_col=None, dtype=dtypes, encoding='utf-8')
    li.append(df)

df = pd.concat(li, axis=1, ignore_index=True)
# sort values
df = df.apply(lambda x: x.sort_values(ascending=False).values)
# name index
df.rename_axis('cardinality', inplace=True)
In [19]:
df.head()
Out[19]:
0 1 2 3 4 5 6 7 8 9
cardinality
0 69701.0 33910.0 20418.0 16451.0 11329.0 4935.0 2652.0 1357.0 1337.0 1326
1 58624.0 32846.0 17482.0 14482.0 10673.0 4374.0 1718.0 1187.0 1155.0 1105
2 44857.0 29204.0 13965.0 11675.0 9110.0 4259.0 1643.0 1066.0 977.0 977
3 39845.0 27001.0 12430.0 10396.0 7707.0 3854.0 1543.0 864.0 829.0 817
4 38387.0 24979.0 8818.0 9496.0 7614.0 3739.0 1535.0 782.0 747.0 747

Visualize the basics

In [20]:
distinct_coords = []
for ix in range(0, 10):
    distinct_coords.append(
        (ix+1, len(df[ix].dropna())))
# append total distinct coords in dataset, 
# without geohash
distinct_coords.append(
    (11, df_distinct_coords["count"][0]))
display(distinct_coords)
[(1, 32),
 (2, 739),
 (3, 8173),
 (4, 73517),
 (5, 451949),
 (6, 1668277),
 (7, 4179536),
 (8, 8164054),
 (9, 11514949),
 (10, 12517634),
 (11, 12692723)]
In [21]:
print(db_cursor_hlldb.statusmessage)
SELECT 1
In [22]:
# create DataFrame using data 
df_counts = pd.DataFrame(
    distinct_coords, columns =['GeoHash Accuracy', 'Distinct Coordinates']) 
In [23]:
sns.relplot(x="GeoHash Accuracy", y="Distinct Coordinates", 
            ci="sd", kind="line", data=df_counts, markers=True, marker='o');
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
ax = plt.gca()
# x = decreasing GeoHash accuracy
ax.set_xlim(11, 0)
Out[23]:
(11.0, 0.0)

Interpolate & style matplotlib

In [24]:
total_distinct_coordinates = df_distinct_coords["count"][0]
df_counts["Distinct Coordinates Perc"] = df_counts["Distinct Coordinates"] / (total_distinct_coordinates/100)
df_counts["Distinct Coordinates"] = df_counts["Distinct Coordinates"]/1000000
df_counts.head()
Out[24]:
GeoHash Accuracy Distinct Coordinates Distinct Coordinates Perc
0 1 0.000032 0.000252
1 2 0.000739 0.005822
2 3 0.008173 0.064391
3 4 0.073517 0.579206
4 5 0.451949 3.560694
In [25]:
# set style
plt.style.use('fivethirtyeight')
# plt.style.use('ggplot')
# plt.style.use('seaborn')
f1 = interp1d(
    x=df_counts["GeoHash Accuracy"], 
    # y=df_counts["Distinct Coordinates"], 
    y=df_counts["Distinct Coordinates Perc"], 
    kind='cubic', fill_value="extrapolate")
xnew = np.linspace(0, 11, num=40, endpoint=True)
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
plt.plot(xnew, f1(xnew), '--', df_counts["GeoHash Accuracy"], df_counts["Distinct Coordinates Perc"],'o')
plt.xticks(np.arange(min(xnew), max(xnew)+1, 1.0))
ax = plt.gca()
# label markers
labels = [f'{i:.2f}' for i in df_counts["Distinct Coordinates Perc"]]
for label, x, y in zip(labels, df_counts["GeoHash Accuracy"], df_counts["Distinct Coordinates Perc"]):
    if float(label) > 0.01:
        plt.annotate(label, xy=(x, y), xytext=(x-0.2, y+0.2))
# general formatting        
ax.set_xlim(10, 0)  # decreasing accuracy
ax.set_xlabel("GeoHash Precision")
ax.set_ylabel("Distinct Coordinates \n (Percentage)")
# plt.gcf().subplots_adjust(bottom=0.15, left=0.15)
plt.tight_layout()
plt.savefig("GeoHash_accuracy_yfcc_coordinates.png", dpi=300)
plt.show()

Percentage of user-coordinate outliers (1-user-cardinality records)

How many distinct coordinates exist at GeoHash precision 6 (column index 5)?

In [26]:
len(df[5].dropna())
Out[26]:
1668277

How many at the highest precision, 10 (column index 9)?

In [27]:
len(df[9].dropna())
Out[27]:
12517634

How many distinct coordinates at GeoHash lv 10 have a user cardinality of 1? (=frequented by 1 user)

In [28]:
df_temp = df[9].dropna()
len(df_temp[df_temp == 1])
Out[28]:
12326103

... about 98.5% of coordinates are frequented by only 1 user at the highest GeoHash precision.

Calculate user-coordinate outliers for all GeoHash levels

Drop NaN rows and calculate total percentage of user-coordinate outliers per GeoHash Level

In [29]:
li_counts = []
for ix in range(0,10):
    df_temp = df[ix].dropna()
    li_counts.append((ix+1, len(df_temp[df_temp == 1])/(len(df_temp)/100)))
In [30]:
li_counts
Out[30]:
[(1, 0.0),
 (2, 20.56833558863329),
 (3, 26.04918634528325),
 (4, 34.511745582654356),
 (5, 50.00165947927753),
 (6, 68.25191499972726),
 (7, 80.02074871468986),
 (8, 87.49490143009834),
 (9, 95.71464884473218),
 (10, 98.46991052782019)]
In [31]:
df_counts_user_outlier = pd.DataFrame(li_counts, columns =['GeoHash Accuracy', 'counts']) 
In [32]:
# set style
plt.style.use('fivethirtyeight')
# plt.style.use('ggplot')
# plt.style.use('seaborn')
f1 = interp1d(
    x=df_counts_user_outlier["GeoHash Accuracy"], 
    y=df_counts_user_outlier["counts"], 
    kind='cubic', fill_value="extrapolate")
xnew = np.linspace(0, 10, num=40, endpoint=True)
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
plt.plot(xnew, f1(xnew), '--', df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"],'o')
plt.xticks(np.arange(min(xnew), max(xnew)+1, 1.0))
ax = plt.gca()
# label markers
labels = [f'{i:.2f}' for i in df_counts_user_outlier["counts"]]
for label, x, y in zip(labels, df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"]):
    if float(label) > 0.01:
        plt.annotate(f'{label}%', xy=(x, y), xytext=(x-0.2, y+0.2))
# general formatting        
ax.set_xlim(10, 0)  # decreasing accuracy
ax.set_ylim(0, 100)  # decreasing accuracy
ax.set_xlabel("GeoHash Precision")
ax.set_ylabel('1-Cardinality Cluster \n (Percentage)')
# plt.gcf().subplots_adjust(bottom=0.15, left=0.15)
plt.tight_layout()
plt.savefig("GeoHash_accuracy_yfcc_coordinates_perc-1-d.png", dpi=300)
plt.show()

Percentage of spatial outlier volume for different metrics (user, posts, userdays)

The evaluation above verifies that user-coordinate outliers constitute highly identifiable information. However, only a few of a user's posts are shared at highly unique (distinct) locations. Typically, a user can be expected to visit such unique locations (that haven't been shared online by others) only once or a few times, while primarily frequenting locations that other people have also visited. Therefore, a significant share of user-locations should disappear "in the crowd". From a privacy perspective, this is important because such behaviour patterns prevent attackers from gaining a full picture of all locations visited by a single user. Similar effects can be expected for userdays and posts. Let's have a look at the numbers below:

The following files include the estimated unioned counts for all data located at outlier coordinates, i.e. coordinates with a cardinality of 1 for the respective metric.

In [33]:
base_path = (
    Path.cwd() / 
    'Figure2_data')
file_path_post1d = (
    base_path / 
    'outlier-stats-geohash-posts.csv')
file_path_userday1d = (
    base_path / 
    'outlier-stats-geohash-userdays.csv')
file_path_user1d = (
    base_path / 
    'outlier-stats-geohash-user.csv')

dtypes = {'sum_usercount': int,
          'sum_postcount': int,
          'sum_userdays': int}
# .iloc[::-1] will reverse order of index
# descending geohash lvs
df_post1d_sum = pd.read_csv(
    file_path_post1d, index_col="geohash",
    dtype=dtypes, encoding='utf-8').iloc[::-1]
df_userday1d_sum = pd.read_csv(
    file_path_userday1d, index_col="geohash",
    dtype=dtypes, encoding='utf-8').iloc[::-1]
df_user1d_sum = pd.read_csv(
    file_path_user1d, index_col="geohash",
    dtype=dtypes, encoding='utf-8').iloc[::-1]
In [34]:
df_post1d_sum.head()
Out[34]:
sum_postcount
geohash
10 9689164
9 8444880
8 4890672
7 1927963
6 588075
In [35]:
li_volume_post = []
total_sum = df_total['postcount'][0] 
print(f"Total geotagged postcount in YFCC: {total_sum}")
for ix in range(0,10):
    li_volume_post.append(
        (ix+1, df_post1d_sum["sum_postcount"][ix+1]/(total_sum/100)))
li_volume_post
Total geotagged postcount in YFCC: 48440022
Out[35]:
[(1, 0.0),
 (2, 0.00019818322956170417),
 (3, 0.002355490259686505),
 (4, 0.02491121907417796),
 (5, 0.22600526482006966),
 (6, 1.214027111713533),
 (7, 3.9801034772444988),
 (8, 10.096345538406238),
 (9, 17.43368324646921),
 (10, 20.002393888260414)]
In [36]:
li_volume_user = []
total_sum = df_total['usercount'][0] 
print(f"Total usercount (geotagged YFCC data): {total_sum}")
for ix in range(0, 10):
    li_volume_user.append(
        (ix+1, df_user1d_sum["sum_usercount"][ix+1]/(total_sum/100)))
li_volume_user
Total usercount (geotagged YFCC data): 211078
Out[36]:
[(1, 0.0),
 (2, 0.058746055960355884),
 (3, 0.595040695856508),
 (4, 4.492652005419797),
 (5, 23.857531339125817),
 (6, 53.143861510910654),
 (7, 75.25796151185817),
 (8, 86.61490065283922),
 (9, 92.70506637356806),
 (10, 94.55319834374022)]
In [37]:
li_volume_userdays = []
total_sum = df_total['userdays'][0] 
print(f"Total userdays (geotagged YFCC data): {total_sum}")
for ix in range(0,10):
    li_volume_userdays.append(
        (ix+1, df_userday1d_sum["sum_userdays"][ix+1]/(total_sum/100)))
li_volume_userdays
Total userdays (geotagged YFCC data): 5970374
Out[37]:
[(1, 0.0),
 (2, 0.0018591800111684795),
 (3, 0.025040307357629524),
 (4, 0.25866051272499846),
 (5, 2.184000533299924),
 (6, 9.936261949418915),
 (7, 25.048631794256107),
 (8, 47.15604081084368),
 (9, 61.42615186251314),
 (10, 63.95867327574454)]
In [38]:
df_volume_post = pd.DataFrame(
    li_volume_post, columns =['GeoHash Accuracy', 'counts']) 
df_volume_user = pd.DataFrame(
    li_volume_user, columns =['GeoHash Accuracy', 'counts']) 
df_volume_userdays = pd.DataFrame(
    li_volume_userdays, columns =['GeoHash Accuracy', 'counts']) 

Plot together

In [39]:
# set style
plt.style.use('fivethirtyeight')
# plt.style.use('ggplot')
# plt.style.use('seaborn')
f1 = interp1d(
    x=df_counts_user_outlier["GeoHash Accuracy"], 
    y=df_counts_user_outlier["counts"], 
    kind='cubic', fill_value="extrapolate")
f2 = interp1d(
    x=df_volume_post["GeoHash Accuracy"], 
    y=df_volume_post["counts"], 
    kind='cubic', fill_value="extrapolate")
f3 = interp1d(
    x=df_volume_user["GeoHash Accuracy"], 
    y=df_volume_user["counts"], 
    kind='cubic', fill_value="extrapolate")
f4 = interp1d(
    x=df_volume_userdays["GeoHash Accuracy"], 
    y=df_volume_userdays["counts"], 
    kind='cubic', fill_value="extrapolate")
f5 = interp1d(
    x=df_counts["GeoHash Accuracy"], 
    y=df_counts["Distinct Coordinates Perc"], 
    kind='cubic', fill_value="extrapolate")
xnew = np.linspace(1, 10, num=40, endpoint=True)
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
plt.axvline(5, 0, 100, label='Chosen geo-privacy threshold', linestyle=':', alpha=0.5)

marker_plotting_opts = {
    "color": "white",
    "markersize": 8,
    "markeredgewidth": 0.5,
    "markeredgecolor": "grey"}
# ['008fd5', 'fc4f30', 'e5ae38', '6d904f', '8b8b8b', '810f7c']
plt.plot(xnew, f1(xnew), '--', label='User-Coordinate outliers', color="#fc4f30")
plt.plot(xnew, f5(xnew), '--', label='Coordinate outliers', color="#008fd5")
plt.plot(xnew, f3(xnew), '--', label='User outliers', color="#6d904f") 
plt.plot(xnew, f4(xnew), '--', label='Userday outliers', color="#e5ae38") 
plt.plot(xnew, f2(xnew), '--', label='Post outliers', color="#810f7c")
plt.plot(df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"], 'o', **marker_plotting_opts)
plt.plot(df_volume_post["GeoHash Accuracy"], df_volume_post["counts"], 'o', **marker_plotting_opts)
plt.plot(df_volume_user["GeoHash Accuracy"], df_volume_user["counts"], 'o', **marker_plotting_opts)
plt.plot(df_volume_userdays["GeoHash Accuracy"], df_volume_userdays["counts"], 'o', **marker_plotting_opts)
plt.plot(df_counts["GeoHash Accuracy"], df_counts["Distinct Coordinates Perc"], 'o', **marker_plotting_opts)

plt.xticks(np.arange(min(xnew), max(xnew)+1, 1.0))
ax = plt.gca()
fig = plt.gcf()
fig.set_size_inches(10, 5)
fig.set_facecolor('white')
# fig.suptitle('Volume of global outliers', fontsize=20, color='grey')
ax.legend()
# label markers
labels = [f'{i:.1f}' for i in df_counts_user_outlier["counts"]]
for label, x, y in zip(labels, df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"]):
    if float(label) > 0.01:
        offset_x = -0.5
        if x == 5:
            offset_x = 1.2
        plt.annotate(f'{label}%', xy=(x, y), xytext=(x+offset_x, y+0.2))
        
# labels = [f'{i:.2f}' for i in df_volume_post["counts"]]
# for label, x, y in zip(labels, df_volume_post["GeoHash Accuracy"], df_volume_post["counts"]):
#     if float(label) < 99:
#         plt.annotate(f'{label}%', xy=(x, y), xytext=(x-0.2, y-0.8))
# general formatting
ax.set_xlim(10.2, 0)  # decreasing accuracy
ax.set_ylim(-3, 101)  # decreasing accuracy
ax.set_xlabel("GeoHash Precision")
ax.set_ylabel('Percentage')
ax.xaxis.label.set_color('grey')
ax.yaxis.label.set_color('grey')
ax.set_facecolor('white')
# ax.grid(which='minor', alpha=0.2)
# ax.grid(which='major', alpha=0.5)
ax.grid(color='grey', linestyle='-', linewidth=0.5)

import matplotlib as mpl
mpl.rcParams["savefig.facecolor"] = 'white'

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.tick_params(axis='x', colors='grey')
ax.tick_params(axis='y', colors='grey')

plt.gcf().subplots_adjust(bottom=0.15, left=0.15)
# plt.tight_layout()
plt.savefig("GeoHash_accuracy_yfcc_coordinates_perc-1-d_combined_notitle.png", dpi=300)
plt.show()

Additional statistics

The privacy-preserving effect of HLL depends on the size of the HLL sets: small sets that contain only one user ID are the most vulnerable to intersection attacks.

How many small sets exist in a dataset depends on the overall distribution of the data, which can be clustered or dispersed. Social media data typically features a long-tailed, heavily clustered pattern, which enhances user privacy at crowded places.

These assertions can be quantitatively substantiated with the given dataset.
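
As a simplified illustration of why single-item sets are the weak spot (plain SHA-256 stands in for the postgres-hll hashing and binary format here): hashing is deterministic, so an attacker who can enumerate candidate ids can recompute the sketch for each candidate and match it against a published single-user set, whereas sets containing many users only permit probabilistic membership tests. Note that the ids in this notebook are additionally pseudonymized with the keyed crypt_hash() before they enter any HLL set, so this kind of recomputation would also require knowledge of the secret key.

In [ ]:
import hashlib

def single_item_sketch(user_id: str) -> str:
    """Simplified stand-in for hll_add(hll_empty(), hll_hash_text(user_id))."""
    return hashlib.sha256(user_id.encode()).hexdigest()

# hypothetical published sketch for a coordinate visited by exactly one user
published_set = single_item_sketch("user_guid_42")

# an attacker with a candidate list re-identifies the single user at that location
candidates = ["user_guid_7", "user_guid_42", "user_guid_99"]
print([c for c in candidates if single_item_sketch(c) == published_set])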

How many distinct coordinates exist in the raw dataset?

In [12]:
sql_query = """
SELECT COUNT(DISTINCT(ST_Y(post_latlng), ST_X(post_latlng)))
FROM topical.post
WHERE post_geoaccuracy IN ('place', 'latlng', 'city');
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
Out[12]:
count
0 12692723

How many distinct user ids exist in the raw dataset?

In [20]:
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.user_0 AS
    SELECT
        extensions.crypt_hash(
            t1.user_guid, '{crypt_key}') as "user_guid"
    FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
SELECT 100000000
In [21]:
sql_query = f"""
SELECT COUNT(DISTINCT(user_guid))
FROM mviews.user_0 t1;
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
count
0 581099
In [26]:
db_connection_hlldb.rollback()

If those user ids are stored as individual HLL sets, how many distinct HLL sets exist (a naïve direct count of the binary hll strings)?

In [27]:
sql_query = """
SELECT EXISTS (
   SELECT FROM information_schema.tables 
   WHERE  table_schema = 'mviews'
   AND    table_name   = 'user_0'
   );
"""
result = db_conn_hlldb.query(sql_query)
display(result.empty)
False
In [28]:
if not result.loc[0, 'exists']:
    sql_query = f"""
    IMPORT FOREIGN SCHEMA mviews LIMIT TO (
    user_0
    ) FROM SERVER lbsnraw INTO extensions;
    """
    db_cursor_hlldb.execute(sql_query)
    print(db_cursor_hlldb.statusmessage)
In [29]:
sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);
"""
db_cursor_hlldb.execute(sql_query)
print(db_cursor_hlldb.statusmessage)
SELECT 1
In [30]:
sql_query = f"""
CREATE MATERIALIZED VIEW mviews.user_0_hll AS
    SELECT
        hll_add(hll_empty(), hll_hash_text(user_guid))::text AS user_hll
    FROM extensions.user_0;
"""
db_cursor_hlldb.execute(sql_query)
print(db_cursor_hlldb.statusmessage)
SELECT 100000000
In [37]:
sql_query = f"""
SELECT COUNT(DISTINCT(user_hll))
FROM mviews.user_0_hll t1;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
count
0 17358

If the register width and log2m HLL parameters are reduced, how does this affect the total distinct count of HLL sets?

In [32]:
sql_query = f"""
SELECT hll_set_defaults(10, 4, 0, 1);
CREATE MATERIALIZED VIEW mviews.user_0_hll_v2 AS
    SELECT
        hll_add(hll_empty(), hll_hash_text(user_guid))::text AS user_hll
    FROM extensions.user_0;
"""
db_cursor_hlldb.execute(sql_query)
print(db_cursor_hlldb.statusmessage)
SELECT 100000000

Store changes made so far:

In [38]:
db_connection_hlldb.commit()

Total user estimate for different register width and log2m parameters:

log2m=11 and regwidth=5

In [33]:
sql_query = f"""
SELECT
    hll_cardinality(
        hll_union_agg(
            user_hll::hll))::int AS usercount
FROM mviews.user_0_hll;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
usercount
0 589475

log2m=10 and regwidth=4

In [34]:
sql_query = f"""
SELECT
    hll_cardinality(
        hll_union_agg(
            user_hll::hll))::int AS usercount
FROM mviews.user_0_hll_v2;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
usercount
0 597234
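
A back-of-the-envelope sketch of what these two parameter sets imply, assuming the postgres-hll dense layout of 2^log2m registers of regwidth bits each and the usual ~1.04/sqrt(m) relative standard error; both estimates above (589,475 and 597,234 vs. the exact count of 581,099) deviate by less than the respective expected error.

In [ ]:
import math

def hll_footprint(log2m: int, regwidth: int):
    m = 2 ** log2m                   # number of registers
    size_bytes = m * regwidth / 8    # dense register payload size
    rel_error = 1.04 / math.sqrt(m)  # typical HLL standard error
    return m, size_bytes, rel_error

for log2m, regwidth in [(11, 5), (10, 4)]:
    m, size_bytes, err = hll_footprint(log2m, regwidth)
    print(f"log2m={log2m}, regwidth={regwidth}: {m} registers, "
          f"~{size_bytes:.0f} bytes, ~{err:.1%} expected relative error")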

Total estimated statistics for postcount, userdays, and coordinate count (Table 2)

Coordinates:

In [45]:
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.coordinate_0 AS
    SELECT
        (ST_Y(post_latlng)::text || ST_X(post_latlng)::text) AS "coordinate"
    FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
CREATE MATERIALIZED VIEW
In [46]:
sql_query = """
SELECT EXISTS (
   SELECT FROM information_schema.tables 
   WHERE  table_schema = 'extensions'
   AND    table_name   = 'coordinate_0'
   );
"""
result = db_conn_hlldb.query(sql_query)
display(result.empty)
False
In [49]:
if not result.loc[0, 'exists']:
    sql_query = f"""
    IMPORT FOREIGN SCHEMA mviews LIMIT TO (
        coordinate_0
    ) FROM SERVER lbsnraw INTO extensions;
    """
    db_cursor_hlldb.execute(sql_query)
    print(db_cursor_hlldb.statusmessage)

sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);
SELECT
    hll_cardinality(
        hll_add_agg(
                hll_hash_text(coordinate)
            )
        )::int AS coordinatecount
FROM extensions.coordinate_0;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
coordinatecount
0 12756691

Postcount:

In [50]:
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.post_0 AS
    SELECT
        extensions.crypt_hash(
            t1.post_guid, '{crypt_key}') as "post_guid"
    FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
CREATE MATERIALIZED VIEW

rawdb count:

In [51]:
sql_query = f"""
SELECT COUNT(DISTINCT(post_guid))
FROM mviews.post_0;
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
count
0 100000000

hlldb count:

In [54]:
sql_query = """
SELECT EXISTS (
   SELECT FROM information_schema.tables 
   WHERE  table_schema = 'extensions'
   AND    table_name   = 'post_0'
   );
"""
result = db_conn_hlldb.query(sql_query)
display(result)
exists
0 True
In [59]:
if not result.loc[0, 'exists']:
    sql_query = f"""
    IMPORT FOREIGN SCHEMA mviews LIMIT TO (
        post_0
    ) FROM SERVER lbsnraw INTO extensions;
    """
    db_cursor_hlldb.execute(sql_query)
    print(db_cursor_hlldb.statusmessage)
    
sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);

SELECT
    hll_cardinality(
        hll_add_agg(
                hll_hash_text(post_guid)
            )
        )::int AS postcount
FROM extensions.post_0;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
postcount
0 98553392

User day count:

In [60]:
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.userday_0 AS
    SELECT
        extensions.crypt_hash(
            (user_guid || to_char(t1.post_create_date, 'yyyy-MM-dd')),
            '{crypt_key}') AS "user_day"
    FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
CREATE MATERIALIZED VIEW

rawdb count:

In [61]:
sql_query = f"""
SELECT COUNT(DISTINCT(user_day))
FROM mviews.userday_0;
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
count
0 17662780

hlldb count:

In [62]:
sql_query = """
SELECT EXISTS (
   SELECT FROM information_schema.tables 
   WHERE  table_schema = 'extensions'
   AND    table_name   = 'userday_0'
   );
"""
result = db_conn_hlldb.query(sql_query)
display(result.empty)
False
In [63]:
if not result.loc[0, 'exists']:
    sql_query = f"""
    IMPORT FOREIGN SCHEMA mviews LIMIT TO (
        userday_0
    ) FROM SERVER lbsnraw INTO extensions;
    """
    db_cursor_hlldb.execute(sql_query)
    print(db_cursor_hlldb.statusmessage)
    

sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);

SELECT
    hll_cardinality(
        hll_add_agg(
                hll_hash_text(user_day)
            )
        )::int AS userdays
FROM extensions.userday_0;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
userdays
0 17678373
In [64]:
db_connection_hlldb.commit()

Additional counts of single-item HLL sets

Total number of single-item HLL sets for GeoHash 5, which reflects the chosen initial granularity of spatial data at collection time.

In [72]:
sql_query = f"""
SELECT COUNT(*)
FROM spatial.latlng
WHERE hll_cardinality(user_hll)::int = 1;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
count
0 226025
In [83]:
print(
    f"{query_df.loc[0]['count']} locations contain only 1 user id at a GeoHash=5. "
    f"This represents {query_df.loc[0]['count']/(12756691 / 100):.2f}% of "
    f"total distinct locations in the YFCC100M dataset")
226025 locations contain only 1 user id at a GeoHash=5. This represents 1.77% of total distinct locations in the YFCC100M dataset

How many distinct users are present in these single-item HLL sets?

In [87]:
sql_query = f"""
SELECT hll_cardinality(hll_union_agg(user_hll))::int
FROM spatial.latlng
WHERE hll_cardinality(user_hll)::int = 1;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
hll_cardinality
0 50358
In [88]:
print(
    f"{query_df.loc[0]['hll_cardinality']} users have visited at least one location where no other user has been at GeoHash=5. "
    f"This represents {query_df.loc[0]['hll_cardinality']/(597234 / 100):.2f}% of "
    f"total users in the YFCC100M dataset")
50358 users have visited at least one location where no other user has been at GeoHash=5. This represents 8.43% of total users in the YFCC100M dataset

Total number of single-item HLL sets for the 100 km grid aggregation, which reflects the chosen granularity of spatial data at publication time (db internal).

In [11]:
# load union_all_hll() methods from _04_interpretation notebook
from _04_interpretation import *
In [12]:
grid = grid_agg_fromcsv(
    OUTPUT / "csv" / "yfcc_all_est_benchmark_internal.csv",
    columns=["xbin", "ybin", "usercount_hll"],
    metrics=["usercount_est"],
    grid_size=GRID_SIZE_METERS)
In [13]:
len(grid[grid["usercount_est"] == 1])
Out[13]:
3354
In [14]:
len(grid[grid["usercount_est"] > 1])
Out[14]:
9235
In [15]:
print(
    f"{len(grid[grid['usercount_est'] == 1])} grid bins (100km) have a user count cardinality of 1. "
    f"This represents {len(grid[grid['usercount_est'] == 1])/(len(grid[grid['usercount_est'] >= 1]) / 100):.2f}% of "
    f"grid bins with data")
3354 grid bins (100km) have a user count cardinality of 1. This represents 26.64% of grid bins with data

However, how many distinct users are present in these single-item HLL sets of 100 km bins?

In [17]:
cardinality_series = union_all_hll(
        grid[grid["usercount_est"] == 1]["usercount_hll"].dropna(),
        db_conn = db_conn_hlldb)
In [18]:
cardinality_series[0]
Out[18]:
1833
In [19]:
print(
    f"{cardinality_series[0]} users have at least one grid bin (100km) where no other user has been. "
    f"This represents {cardinality_series[0]/(597234 / 100):.2f}% of "
    f"total users in the YFCC100M dataset")
1833 users have at least one grid bin (100km) where no other user has been. This represents 0.31% of total users in the YFCC100M dataset

Published benchmark data (user count >= 100): how many total posts are included in bins with >= 10000 posts (high privacy-preserving effect according to Desfontaines):

In [20]:
grid = grid_agg_fromcsv(
    OUTPUT / "csv" / "yfcc_all_est_benchmark_internal.csv",
    columns=["xbin", "ybin", "postcount_hll"],
    metrics=["postcount_est"],
    grid_size=GRID_SIZE_METERS)
In [21]:
cardinality_series = union_all_hll(
        grid[grid["postcount_est"] >= 10000]["postcount_hll"].dropna(),
        db_conn = db_conn_hlldb)
In [24]:
cardinality_series[0]
Out[24]:
41582251

There are 48,469,829 total geotagged posts (with the chosen geo-accuracy levels) in the YFCC100M dataset; this is the reference value used below.

In [25]:
print(
    f"{cardinality_series[0]} of posts are from bins (100km) with >= 10000 total posts. "
    f"This represents {cardinality_series[0]/(48469829 / 100):.2f}% of "
    f"total posts in the YFCC100M dataset")
41582251 of posts are from bins (100km) with >= 10000 total posts. This represents 85.79% of total posts in the YFCC100M dataset

Convert notebook to html

In [26]:
!jupyter nbconvert --to html_toc \
    --output-dir=../out/html ./Figure2_outlier_analysis.ipynb \
    --template=../nbconvert.tpl \
    --ExtractOutputPreprocessor.enabled=False # create single output file
[NbConvertApp] Converting notebook ./Figure2_outlier_analysis.ipynb to html_toc
[NbConvertApp] Writing 706338 bytes to ../out/html/Figure2_outlier_analysis.html
In [ ]: