Global spatial outlier analysis (YFCC100m)

Summary

This notebook contains the code to generate Figure 2, which illustrates several ways to quantify the number of outliers in the YFCC100m dataset.

Preparations

Import dependencies

In [27]:
import os, sys
import psycopg2 # Postgres API
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from pathlib import Path
from IPython.display import clear_output, display, HTML, Markdown
from scipy.interpolate import interp1d
In [28]:
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
    sys.path.append(module_path)
from modules import tools, preparations
In [29]:
# define path to output directory (figures etc.)
OUTPUT = Path.cwd().parents[0] / "out" / 'Figure2_data'
In [30]:
OUTPUT.mkdir(exist_ok=True)

Load keys from .env

In [31]:
import os
from dotenv import load_dotenv

dotenv_path = Path.cwd().parents[0] / '.env'
load_dotenv(dotenv_path)
Out[31]:
True
In [32]:
db_user = "postgres"
db_pass = os.getenv('POSTGRES_PASSWORD')
crypt_key = os.getenv("CRYPT_KEY")
read_only_user_key = os.getenv("USER_KEY")
In [33]:
db_connection_hlldb = psycopg2.connect(
        host="hlldb",
        port="5432",
        dbname="hlldb",
        user=db_user,
        password=db_pass
)
db_connection_rawdb = psycopg2.connect(
        host="rawdb",
        port="5432",
        dbname="rawdb",
        user=db_user,
        password=db_pass
)

Test connection:

In [34]:
db_query = """
    SELECT 1;
"""
# create pandas DataFrame from database data
df = pd.read_sql_query(db_query, db_connection_hlldb)
display(df.head())
?column?
0 1

Simplify query execution with a small helper class:

In [35]:
db_conn_hlldb = tools.DbConn(db_connection_hlldb)
db_conn_hlldb.query("SELECT 1;")
db_conn_rawdb = tools.DbConn(db_connection_rawdb)
db_conn_rawdb.query("SELECT 1;")
Out[35]:
?column?
0 1

If we only want to execute queries without returning results:

In [36]:
db_cursor_hlldb = db_connection_hlldb.cursor()
db_cursor_hlldb.execute("SELECT 1;")
db_cursor_hlldb.fetchone()
db_cursor_rawdb = db_connection_rawdb.cursor()
db_cursor_rawdb.execute("SELECT 1;")
db_cursor_rawdb.fetchone()
Out[36]:
(1,)

In case of an SQL error, roll back manually with:

In [13]:
db_connection_hlldb.rollback()
db_connection_rawdb.rollback()

Prepare schemas and the connection from rawdb to hlldb

In [63]:
db_sql = f'''CREATE SCHEMA IF NOT EXISTS mviews;
ALTER DATABASE rawdb
SET search_path = "$user",
                  social,
                  spatial,
                  temporal,
                  topical,
                  interlinkage,
                  extensions,
                  mviews;
CREATE EXTENSION IF NOT EXISTS pgcrypto SCHEMA extensions;
'''
db_cursor_rawdb.execute(db_sql)
print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
CREATE EXTENSION
In [64]:
db_sql = f'''/* Produce pseudonymized hash of input id with skey
 * - using skey as seed value
 * - sha256 cryptographic hash function
 * - encode in base64 to reduce length of hash
 * - remove trailing '=' from base64 string
 * - return as text
 */
CREATE OR REPLACE FUNCTION 
extensions.crypt_hash (id text, skey text)
RETURNS text
AS $$
    SELECT 
        RTRIM(
            ENCODE(
                HMAC(
                    id::bytea,
                    skey::bytea,
                    'sha256'), 
                'base64'),
            '=')
$$
LANGUAGE SQL
STRICT;
'''
db_cursor_rawdb.execute(db_sql)
print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
CREATE FUNCTION
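
For illustration, a minimal Python sketch of the same pseudonymization step (standard library only, hypothetical input values): HMAC-SHA256 keyed with the secret, base64-encoded, trailing '=' stripped.

In [ ]:
import base64
import hashlib
import hmac

def crypt_hash(id_text: str, skey: str) -> str:
    """Pseudonymize an id: HMAC-SHA256 keyed with skey, base64, '=' stripped."""
    digest = hmac.new(skey.encode(), id_text.encode(), hashlib.sha256).digest()
    return base64.b64encode(digest).decode().rstrip("=")

# hypothetical example values
crypt_hash("12345@N00", "example-secret-key")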

Select from RAW, Group by GeoHash Coordinates

In [65]:
for geohash_precision in range(8, 11):
    db_sql = f'''
    DROP MATERIALIZED VIEW 
    mviews.spatiallatlng_raw_geohash_{geohash_precision:02d};
    '''
    db_cursor_rawdb.execute(db_sql)
    print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
DROP MATERIALIZED VIEW
DROP MATERIALIZED VIEW
DROP MATERIALIZED VIEW

The following queries take quite some time:

In [73]:
%%time
for geohash_precision in range(8, 11):
    db_sql = f'''
    CREATE MATERIALIZED VIEW mviews.spatiallatlng_raw_geohash_{geohash_precision:02d} AS
    SELECT  extensions.crypt_hash(t1.post_guid, '{crypt_key}') as "post_guid",
            ST_Y(ST_PointFromGeoHash(ST_GeoHash(t1.post_latlng, {geohash_precision}), {geohash_precision})) As "latitude", 
            ST_X(ST_PointFromGeoHash(ST_GeoHash(t1.post_latlng, {geohash_precision}), {geohash_precision})) As "longitude", 
            extensions.crypt_hash(t1.user_guid, '{crypt_key}') as "user_guid",
            to_char(t1.post_create_date, 'yyyy-MM-dd') as "post_create_date",
            t1.post_geoaccuracy
    FROM   topical.post t1
    WHERE t1.post_geoaccuracy IN ('place', 'latlng', 'city');
    '''
    db_cursor_rawdb.execute(db_sql)
    print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
SELECT 47843599
SELECT 47843599
SELECT 47843599
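
For reference, a pure-Python sketch of the coordinate generalization applied by ST_GeoHash() and ST_PointFromGeoHash() above: each point is encoded to a GeoHash of the given precision and then replaced by the center of that GeoHash cell, so all coordinates within one cell collapse onto a single representative point. The example coordinates below are hypothetical.

In [ ]:
BASE32 = "0123456789bcdefghjkmnpqrstuvwxyz"

def geohash_encode(lat: float, lng: float, precision: int) -> str:
    """Encode lat/lng into a GeoHash string of the given character precision."""
    lat_lo, lat_hi = -90.0, 90.0
    lng_lo, lng_hi = -180.0, 180.0
    bits, char, even = 0, 0, True
    geohash = []
    while len(geohash) < precision:
        if even:  # even bits refine longitude
            mid = (lng_lo + lng_hi) / 2
            if lng >= mid:
                char = (char << 1) | 1
                lng_lo = mid
            else:
                char = char << 1
                lng_hi = mid
        else:  # odd bits refine latitude
            mid = (lat_lo + lat_hi) / 2
            if lat >= mid:
                char = (char << 1) | 1
                lat_lo = mid
            else:
                char = char << 1
                lat_hi = mid
        even = not even
        bits += 1
        if bits == 5:  # 5 bits per base32 character
            geohash.append(BASE32[char])
            bits, char = 0, 0
    return "".join(geohash)

def geohash_cell_center(geohash: str) -> tuple:
    """Decode a GeoHash to the center (lat, lng) of its cell."""
    lat_lo, lat_hi = -90.0, 90.0
    lng_lo, lng_hi = -180.0, 180.0
    even = True
    for c in geohash:
        val = BASE32.index(c)
        for bit in (16, 8, 4, 2, 1):
            if even:
                mid = (lng_lo + lng_hi) / 2
                lng_lo, lng_hi = (mid, lng_hi) if val & bit else (lng_lo, mid)
            else:
                mid = (lat_lo + lat_hi) / 2
                lat_lo, lat_hi = (mid, lat_hi) if val & bit else (lat_lo, mid)
            even = not even
    return ((lat_lo + lat_hi) / 2, (lng_lo + lng_hi) / 2)

# two nearby points fall into the same precision-5 cell and share one center
print(geohash_cell_center(geohash_encode(51.0504, 13.7373, 5)))
print(geohash_cell_center(geohash_encode(51.0511, 13.7401, 5)))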

Connect hlldb to rawdb

On rawdb, create a read-only user:

In [87]:
db_sql = f'''
-- cleanup
DROP SERVER IF EXISTS lbsnraw CASCADE;
DROP OWNED BY lbsn_reader;
DROP USER IF EXISTS lbsn_reader;

CREATE USER lbsn_reader WITH
    LOGIN
    INHERIT
    PASSWORD '{read_only_user_key}';

GRANT CONNECT ON DATABASE rawdb TO lbsn_reader;
GRANT USAGE ON SCHEMA mviews TO lbsn_reader;
GRANT SELECT ON ALL TABLES IN SCHEMA mviews TO lbsn_reader;
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA mviews TO lbsn_reader;
'''
db_cursor_rawdb.execute(db_sql)
print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
GRANT

On hlldb, enable the fdw extension and create the connection:

In [97]:
db_sql = f'''
CREATE EXTENSION IF NOT EXISTS postgres_fdw SCHEMA extensions;
CREATE SERVER IF NOT EXISTS lbsnraw FOREIGN DATA WRAPPER postgres_fdw OPTIONS (
    host 'rawdb',
    dbname 'rawdb',
    port '5432',
    keepalives '1',
    keepalives_idle '30',
    keepalives_interval '10',
    keepalives_count '5',
    fetch_size '500000');
DROP USER MAPPING IF EXISTS for postgres SERVER lbsnraw;
CREATE USER MAPPING for postgres
SERVER lbsnraw 
OPTIONS (user 'lbsn_reader', password '{read_only_user_key}');
'''
db_cursor_hlldb.execute(db_sql)
print(db_cursor_hlldb.statusmessage)
# commit changes to db
db_connection_hlldb.commit()
CREATE USER MAPPING

Import to hlldb

In [102]:
table_list = ','.join(
    [f'spatiallatlng_raw_geohash_{geohash_precision:02d}' for geohash_precision in range(1, 11)])
db_sql = f'''
DROP FOREIGN TABLE {table_list};
IMPORT FOREIGN SCHEMA mviews 
LIMIT TO (
    {table_list})
FROM SERVER lbsnraw INTO extensions;
'''
db_cursor_hlldb.execute(db_sql)
print(db_cursor_hlldb.statusmessage)
# commit changes to db
db_connection_hlldb.commit()
IMPORT FOREIGN SCHEMA

hlldb preparations

Prepare structure for hll aggregate data per coordinate and GeoHash Level

Make sure output data doesn't exist already:

In [38]:
for geohash_precision in range(8, 11):
    db_sql = f'''
    TRUNCATE spatial.latlng_{geohash_precision:02d};
    '''
    db_cursor_hlldb.execute(db_sql)
    print(db_cursor_hlldb.statusmessage)
# commit changes to db
db_connection_hlldb.commit()
TRUNCATE TABLE
TRUNCATE TABLE
TRUNCATE TABLE

Create structure:

In [40]:
for geohash_precision in range(8, 11):
    db_sql = f'''
    CREATE TABLE IF NOT EXISTS spatial.latlng_{geohash_precision:02d} (
      latitude float,
      longitude float,
      PRIMARY KEY (latitude, longitude),
      latlng_geom geometry(Point, 4326) NOT NULL)
    INHERITS (
      social.user_hll, -- e.g. number of users/latlng (=UPL)
      topical.post_hll, -- e.g. number of posts/latlng
      temporal.date_hll -- e.g. number of userdays/latlng (=PUD)
    );
    '''
    db_cursor_hlldb.execute(db_sql)
    print(db_cursor_hlldb.statusmessage)
# commit changes to db
db_connection_hlldb.commit()
CREATE TABLE
CREATE TABLE
CREATE TABLE

Generate aggregate data (hlldb) from rawdb

In [ ]:
for geohash_precision in range(8, 11):
    db_sql = f'''
    INSERT INTO spatial.latlng_{geohash_precision:02d}(latitude, longitude, user_hll, post_hll, date_hll, latlng_geom)
    SELECT  latitude,
            longitude,
            hll_add_agg(hll_hash_text(user_guid)) as user_hll,
            hll_add_agg(hll_hash_text(post_guid)) as post_hll,
            hll_add_agg(hll_hash_text(user_guid || post_create_date)) as date_hll,
            ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) as latlng_geom
    FROM extensions.spatiallatlng_raw_geohash_{geohash_precision:02d}
    GROUP BY latitude, longitude;
    '''
    db_cursor_hlldb.execute(db_sql)
    print(db_cursor_hlldb.statusmessage)
db_connection_hlldb.commit()
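
Conceptually, the INSERT above groups posts by their generalized coordinate and stores, per coordinate, three HLL sketches instead of the individual (already pseudonymized) ids. A minimal pandas sketch with toy data (column names matching the materialized view) of the exact-count equivalent, to clarify what each hll column estimates:

In [ ]:
import pandas as pd

# toy stand-in for extensions.spatiallatlng_raw_geohash_NN
posts = pd.DataFrame({
    "latitude":         [51.04, 51.04, 48.14],
    "longitude":        [13.73, 13.73, 11.57],
    "user_guid":        ["u1", "u2", "u1"],
    "post_guid":        ["p1", "p2", "p3"],
    "post_create_date": ["2012-05-01", "2012-05-01", "2012-06-03"],
})
posts["userday"] = posts["user_guid"] + posts["post_create_date"]

exact_counts = posts.groupby(["latitude", "longitude"]).agg(
    usercount=("user_guid", "nunique"),   # ~ hll_cardinality(user_hll)
    postcount=("post_guid", "nunique"),   # ~ hll_cardinality(post_hll)
    userdays=("userday", "nunique"),      # ~ hll_cardinality(date_hll)
)
exact_counts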

Collect statistics data

Get usercount for distinct coordinate statistics

In [18]:
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT 
            hll_cardinality(user_hll)::int as user_hll
        FROM spatial.latlng_{geohash_precision:02d}
        ''')
    print(
        f"Queried {len(df_tmp)} distinct coordinates with hll "
        f"aggregates (usercount) for GeoHash precision {geohash_precision}")
    df_tmp.to_csv(
        Path('Figure2_data') / f'geohash{geohash_precision:02d}_spatial_latlng_cardinality.csv',
        mode='w', index=False, header=True)
Queried 32 distinct coordinates with hll aggregates for GeoHash precision 1
Queried 739 distinct coordinates with hll aggregates for GeoHash precision 2
Queried 8173 distinct coordinates with hll aggregates for GeoHash precision 3
Queried 73517 distinct coordinates with hll aggregates for GeoHash precision 4
Queried 451949 distinct coordinates with hll aggregates for GeoHash precision 5
Queried 1668277 distinct coordinates with hll aggregates for GeoHash precision 6
Queried 4179536 distinct coordinates with hll aggregates for GeoHash precision 7
Queried 8164054 distinct coordinates with hll aggregates for GeoHash precision 8
Queried 11514949 distinct coordinates with hll aggregates for GeoHash precision 9
Queried 12517634 distinct coordinates with hll aggregates for GeoHash precision 10

Get total statistics

Union of all hll sets for postcount, userdays and usercount.

In [14]:
file_path = Path('Figure2_data') / 'total-statistics.csv'
if file_path.exists():
    # fast read from intermediate data
    df_total = pd.read_csv(file_path, index_col=0)
else:
    # query from db
    db_sql = f'''
    SELECT hll_cardinality(hll_union_agg(post_hll))::int as postcount, 
           hll_cardinality(hll_union_agg(date_hll))::int as userdays,
           hll_cardinality(hll_union_agg(user_hll))::int as usercount
    FROM spatial.latlng_01
    '''
    df_total = db_conn_hlldb.query(db_sql)
    # store intermediate data
    df_total.to_csv(
        file_path, mode='w', 
        index=True, header=True)
# show results
df_total.head()
Out[14]:
postcount userdays usercount
0 48440022 5970374 211078

Get the total number of distinct coordinates from the raw database (without GeoHash aggregation):

In [13]:
file_path = Path('Figure2_data') / 'distinct-coordinates-yfcc.csv'
if file_path.exists():
    # fast read from intermediate data
    df_distinct_coords = pd.read_csv(file_path, index_col=0)
else:
    # query from db
    db_sql = f'''
    SELECT COUNT(DISTINCT(ST_Y(post_latlng), ST_X(post_latlng)))
    FROM topical.post
    WHERE post_geoaccuracy IN ('place', 'latlng', 'city');
    '''
    df_distinct_coords = db_conn_rawdb.query(db_sql)
    # store intermediate data
    df_distinct_coords.to_csv(
        file_path, mode='w', 
        index=True, header=True)
df_distinct_coords.head()
Out[13]:
count
0 12692723

Get share of user data (partially) hidden in the crowd

For usercount

In [15]:
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT 
            {geohash_precision:02d} as geohash,
            hll_cardinality(hll_union_agg(a.user_hll))::int as sum_usercount
        FROM (SELECT 
                  s.user_hll as user_hll
              FROM spatial.latlng_{geohash_precision:02d} s
              WHERE hll_cardinality(s.user_hll)::int = 1) a
        ''')
    display(df_tmp.head())
    df_tmp['sum_usercount'].fillna(0, inplace=True)
    print(
        f"At GeoHash precision {geohash_precision}, \n"
        f"{df_tmp['sum_usercount'][0]} of distinct users are "
        f"found at spatial outlier locations "
        f"(of {df_total['usercount'][0]} total users, "
        f"{df_tmp['sum_usercount'][0]/(df_total['usercount'][0]/100):.0f}%).")
    mode = 'a'
    header = False
    if geohash_precision == 1:
        mode = 'w' 
        header = True
    df_tmp.to_csv(
        Path('Figure2_data') / f'outlier-stats-geohash-user.csv', 
        mode=mode, index=False, header=header)
geohash sum_usercount
0 1 None
At GeoHash precision 1, 
0 of distinct users are found at spatial outlier locations (of 211078 total users, 0%).
geohash sum_usercount
0 2 124
At GeoHash precision 2, 
124 of distinct users are found at spatial outlier locations (of 211078 total users, 0%).
geohash sum_usercount
0 3 1256
At GeoHash precision 3, 
1256 of distinct users are found at spatial outlier locations (of 211078 total users, 1%).
geohash sum_usercount
0 4 9483
At GeoHash precision 4, 
9483 of distinct users are found at spatial outlier locations (of 211078 total users, 4%).
geohash sum_usercount
0 5 50358
At GeoHash precision 5, 
50358 of distinct users are found at spatial outlier locations (of 211078 total users, 24%).
geohash sum_usercount
0 6 112175
At GeoHash precision 6, 
112175 of distinct users are found at spatial outlier locations (of 211078 total users, 53%).
geohash sum_usercount
0 7 158853
At GeoHash precision 7, 
158853 of distinct users are found at spatial outlier locations (of 211078 total users, 75%).
geohash sum_usercount
0 8 182825
At GeoHash precision 8, 
182825 of distinct users are found at spatial outlier locations (of 211078 total users, 87%).
geohash sum_usercount
0 9 195680
At GeoHash precision 9, 
195680 of distinct users are found at spatial outlier locations (of 211078 total users, 93%).
geohash sum_usercount
0 10 199581
At GeoHash precision 10, 
199581 of distinct users are found at spatial outlier locations (of 211078 total users, 95%).

For postcount

In [16]:
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT 
            {geohash_precision:02d} as geohash,
            hll_cardinality(hll_union_agg(a.post_hll))::int as sum_postcount
        FROM (SELECT 
                  s.post_hll as post_hll
              FROM spatial.latlng_{geohash_precision:02d} s
              WHERE hll_cardinality(s.post_hll)::int = 1) a
        ''')
    display(df_tmp.head())
    df_tmp['sum_postcount'].fillna(0, inplace=True)
    print(
        f"At GeoHash precision {geohash_precision}, \n"
        f"{df_tmp['sum_postcount'][0]} of distinct posts are "
        f"found at spatial outlier locations "
        f"(of {df_total['postcount'][0]} total posts, "
        f"{df_tmp['sum_postcount'][0]/(df_total['postcount'][0]/100):.0f}%).")
    mode = 'a'
    header = False
    if geohash_precision == 1:
        mode = 'w' 
        header = True
    df_tmp.to_csv(
        Path('Figure2_data') / f'outlier-stats-geohash-posts.csv', 
        mode=mode, index=False, header=header)
geohash sum_postcount
0 1 None
At GeoHash precision 1, 
0 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 0%).
geohash sum_postcount
0 2 96
At GeoHash precision 2, 
96 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 0%).
geohash sum_postcount
0 3 1141
At GeoHash precision 3, 
1141 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 0%).
geohash sum_postcount
0 4 12067
At GeoHash precision 4, 
12067 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 0%).
geohash sum_postcount
0 5 109477
At GeoHash precision 5, 
109477 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 0%).
geohash sum_postcount
0 6 588075
At GeoHash precision 6, 
588075 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 1%).
geohash sum_postcount
0 7 1927963
At GeoHash precision 7, 
1927963 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 4%).
geohash sum_postcount
0 8 4890672
At GeoHash precision 8, 
4890672 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 10%).
geohash sum_postcount
0 9 8444880
At GeoHash precision 9, 
8444880 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 17%).
geohash sum_postcount
0 10 9689164
At GeoHash precision 10, 
9689164 of distinct posts are found at spatial outlier locations (of 48440022 total posts, 20%).

For userdays

In [17]:
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT 
            {geohash_precision:02d} as geohash,
            hll_cardinality(hll_union_agg(a.date_hll))::int as sum_userdays
        FROM (SELECT 
                  s.date_hll as date_hll
              FROM spatial.latlng_{geohash_precision:02d} s
              WHERE hll_cardinality(s.date_hll)::int = 1) a
        ''')
    display(df_tmp.head())
    df_tmp['sum_userdays'].fillna(0, inplace=True)
    print(
        f"At GeoHash precision {geohash_precision}, \n"
        f"{df_tmp['sum_userdays'][0]} of distinct userdays are "
        f"found at spatial outlier locations "
        f"(of {df_total['userdays'][0]} total userdays, "
        f"{df_tmp['sum_userdays'][0]/(df_total['userdays'][0]/100):.0f}%).")
    mode = 'a'
    header = False
    if geohash_precision == 1:
        mode = 'w' 
        header = True
    df_tmp.to_csv(
        Path('Figure2_data') / f'outlier-stats-geohash-userdays.csv', 
        mode=mode, index=False, header=header)
geohash sum_userdays
0 1 None
At GeoHash precision 1, 
0 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 0%).
geohash sum_userdays
0 2 111
At GeoHash precision 2, 
111 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 0%).
geohash sum_userdays
0 3 1495
At GeoHash precision 3, 
1495 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 0%).
geohash sum_userdays
0 4 15443
At GeoHash precision 4, 
15443 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 0%).
geohash sum_userdays
0 5 130393
At GeoHash precision 5, 
130393 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 2%).
geohash sum_userdays
0 6 593232
At GeoHash precision 6, 
593232 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 10%).
geohash sum_userdays
0 7 1495497
At GeoHash precision 7, 
1495497 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 25%).
geohash sum_userdays
0 8 2815392
At GeoHash precision 8, 
2815392 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 47%).
geohash sum_userdays
0 9 3667371
At GeoHash precision 9, 
3667371 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 61%).
geohash sum_userdays
0 10 3818572
At GeoHash precision 10, 
3818572 of distinct userdays are found at spatial outlier locations (of 5970374 total userdays, 64%).

Visualization of results

Correlation between GeoHash Accuracy and Number of distinct locations

In [18]:
home_path = Path.cwd() / 'Figure2_data'
all_files = home_path.glob('*_spatial_latlng_cardinality.csv')

dtypes = {'user_hll': int}

li = []
for filename in sorted(all_files):
    df = pd.read_csv(
        filename, index_col=None, dtype=dtypes, encoding='utf-8')
    li.append(df)

df = pd.concat(li, axis=1, ignore_index=True)
# sort values
df = df.apply(lambda x: x.sort_values(ascending=False).values)
# name index
df.rename_axis('cardinality', inplace=True)
In [19]:
df.head()
Out[19]:
0 1 2 3 4 5 6 7 8 9
cardinality
0 69701.0 33910.0 20418.0 16451.0 11329.0 4935.0 2652.0 1357.0 1337.0 1326
1 58624.0 32846.0 17482.0 14482.0 10673.0 4374.0 1718.0 1187.0 1155.0 1105
2 44857.0 29204.0 13965.0 11675.0 9110.0 4259.0 1643.0 1066.0 977.0 977
3 39845.0 27001.0 12430.0 10396.0 7707.0 3854.0 1543.0 864.0 829.0 817
4 38387.0 24979.0 8818.0 9496.0 7614.0 3739.0 1535.0 782.0 747.0 747

Visualize the basics

In [20]:
distinct_coords = []
for ix in range(0, 10):
    distinct_coords.append(
        (ix+1, len(df[ix].dropna())))
# append total distinct coords in dataset, 
# without geohash
distinct_coords.append(
    (11, df_distinct_coords["count"][0]))
display(distinct_coords)
[(1, 32),
 (2, 739),
 (3, 8173),
 (4, 73517),
 (5, 451949),
 (6, 1668277),
 (7, 4179536),
 (8, 8164054),
 (9, 11514949),
 (10, 12517634),
 (11, 12692723)]
In [21]:
print(db_cursor_hlldb.statusmessage)
SELECT 1
In [22]:
# create DataFrame using data 
df_counts = pd.DataFrame(
    distinct_coords, columns =['GeoHash Accuracy', 'Distinct Coordinates']) 
In [23]:
sns.relplot(x="GeoHash Accuracy", y="Distinct Coordinates", 
            ci="sd", kind="line", data=df_counts, markers=True, marker='o');
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
ax = plt.gca()
# x = decreasing GeoHash accuracy
ax.set_xlim(11, 0)
Out[23]:
(11.0, 0.0)

Interpolate & style matplotlib

In [24]:
total_distinct_coordinates = df_distinct_coords["count"][0]
df_counts["Distinct Coordinates Perc"] = df_counts["Distinct Coordinates"] / (total_distinct_coordinates/100)
df_counts["Distinct Coordinates"] = df_counts["Distinct Coordinates"]/1000000
df_counts.head()
Out[24]:
GeoHash Accuracy Distinct Coordinates Distinct Coordinates Perc
0 1 0.000032 0.000252
1 2 0.000739 0.005822
2 3 0.008173 0.064391
3 4 0.073517 0.579206
4 5 0.451949 3.560694
In [25]:
# set style
plt.style.use('fivethirtyeight')
# plt.style.use('ggplot')
# plt.style.use('seaborn')
f1 = interp1d(
    x=df_counts["GeoHash Accuracy"], 
    # y=df_counts["Distinct Coordinates"], 
    y=df_counts["Distinct Coordinates Perc"], 
    kind='cubic', fill_value="extrapolate")
xnew = np.linspace(0, 11, num=40, endpoint=True)
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
plt.plot(xnew, f1(xnew), '--', df_counts["GeoHash Accuracy"], df_counts["Distinct Coordinates Perc"],'o')
plt.xticks(np.arange(min(xnew), max(xnew)+1, 1.0))
ax = plt.gca()
# label markers
labels = [f'{i:.2f}' for i in df_counts["Distinct Coordinates Perc"]]
for label, x, y in zip(labels, df_counts["GeoHash Accuracy"], df_counts["Distinct Coordinates Perc"]):
    if float(label) > 0.01:
        plt.annotate(label, xy=(x, y), xytext=(x-0.2, y+0.2))
# general formatting        
ax.set_xlim(10, 0)  # decreasing accuracy
ax.set_xlabel("GeoHash Precision")
ax.set_ylabel("Distinct Coordinates \n (Percentage)")
# plt.gcf().subplots_adjust(bottom=0.15, left=0.15)
plt.tight_layout()
plt.savefig("GeoHash_accuracy_yfcc_coordinates.png", dpi=300)
plt.show()

Percentage of user-coordinate outliers (1-user-cardinality records)

How many distinct coordinates exist at GeoHash precision 6 (column index 5)?

In [26]:
len(df[5].dropna())
Out[26]:
1668277

How many at the highest precision, 10 (column index 9)?

In [27]:
len(df[9].dropna())
Out[27]:
12517634

How many distinct coordinates at GeoHash lv 10 have a user cardinality of 1? (=frequented by 1 user)

In [28]:
df_temp = df[9].dropna()
len(df_temp[df_temp == 1])
Out[28]:
12326103

... about 98.5% of coordinates are frequented by only 1 user at the highest GeoHash precision.

Calculate user-coordinate outliers for all GeoHash levels

Drop NaN rows and calculate total percentage of user-coordinate outliers per GeoHash Level

In [29]:
li_counts = []
for ix in range(0,10):
    df_temp = df[ix].dropna()
    li_counts.append((ix+1, len(df_temp[df_temp == 1])/(len(df_temp)/100)))
In [30]:
li_counts
Out[30]:
[(1, 0.0),
 (2, 20.56833558863329),
 (3, 26.04918634528325),
 (4, 34.511745582654356),
 (5, 50.00165947927753),
 (6, 68.25191499972726),
 (7, 80.02074871468986),
 (8, 87.49490143009834),
 (9, 95.71464884473218),
 (10, 98.46991052782019)]
In [31]:
df_counts_user_outlier = pd.DataFrame(li_counts, columns =['GeoHash Accuracy', 'counts']) 
In [32]:
# set style
plt.style.use('fivethirtyeight')
# plt.style.use('ggplot')
# plt.style.use('seaborn')
f1 = interp1d(
    x=df_counts_user_outlier["GeoHash Accuracy"], 
    y=df_counts_user_outlier["counts"], 
    kind='cubic', fill_value="extrapolate")
xnew = np.linspace(0, 10, num=40, endpoint=True)
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
plt.plot(xnew, f1(xnew), '--', df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"],'o')
plt.xticks(np.arange(min(xnew), max(xnew)+1, 1.0))
ax = plt.gca()
# label markers
labels = [f'{i:.2f}' for i in df_counts_user_outlier["counts"]]
for label, x, y in zip(labels, df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"]):
    if float(label) > 0.01:
        plt.annotate(f'{label}%', xy=(x, y), xytext=(x-0.2, y+0.2))
# general formatting        
ax.set_xlim(10, 0)  # decreasing accuracy
ax.set_ylim(0, 100)  # decreasing accuracy
ax.set_xlabel("GeoHash Precision")
ax.set_ylabel('1-Cardinality Cluster \n (Percentage)')
# plt.gcf().subplots_adjust(bottom=0.15, left=0.15)
plt.tight_layout()
plt.savefig("GeoHash_accuracy_yfcc_coordinates_perc-1-d.png", dpi=300)
plt.show()

Percentage of spatial outlier volume for different metrics (user, posts, userdays)

The evaluation above verifies that user-coordinate outliers constitute highly identifiable information. However, only a few of a user's posts are shared at highly unique (distinct) locations. Typically, a user can be expected to visit such unique locations (that haven't been shared online by others) only once or a few times, while primarily frequenting locations that other people have also visited. Therefore, a significant share of user-locations should disappear "in the crowd". From a privacy perspective, this is important because such behaviour patterns prevent attackers from gaining a full picture of all locations visited by a single user. Similar effects can be expected for userdays and posts. Let's have a look at the numbers below:

The following files include the estimated unioned counts for all data located at outlier coordinates, i.e. coordinates with a cardinality of 1 for the respective metric.

In [33]:
base_path = (
    Path.cwd() / 
    'Figure2_data')
file_path_post1d = (
    base_path / 
    'outlier-stats-geohash-posts.csv')
file_path_userday1d = (
    base_path / 
    'outlier-stats-geohash-userdays.csv')
file_path_user1d = (
    base_path / 
    'outlier-stats-geohash-user.csv')

dtypes = {'sum_usercount': int,
          'sum_postcount': int,
          'sum_userdays': int}
# .iloc[::-1] will reverse order of index
# descending geohash lvs
df_post1d_sum = pd.read_csv(
    file_path_post1d, index_col="geohash",
    dtype=dtypes, encoding='utf-8').iloc[::-1]
df_userday1d_sum = pd.read_csv(
    file_path_userday1d, index_col="geohash",
    dtype=dtypes, encoding='utf-8').iloc[::-1]
df_user1d_sum = pd.read_csv(
    file_path_user1d, index_col="geohash",
    dtype=dtypes, encoding='utf-8').iloc[::-1]
In [34]:
df_post1d_sum.head()
Out[34]:
sum_postcount
geohash
10 9689164
9 8444880
8 4890672
7 1927963
6 588075
In [35]:
li_volume_post = []
total_sum = df_total['postcount'][0] 
print(f"Total geotagged postcount in YFCC: {total_sum}")
for ix in range(0,10):
    li_volume_post.append(
        (ix+1, df_post1d_sum["sum_postcount"][ix+1]/(total_sum/100)))
li_volume_post
Total geotagged postcount in YFCC: 48440022
Out[35]:
[(1, 0.0),
 (2, 0.00019818322956170417),
 (3, 0.002355490259686505),
 (4, 0.02491121907417796),
 (5, 0.22600526482006966),
 (6, 1.214027111713533),
 (7, 3.9801034772444988),
 (8, 10.096345538406238),
 (9, 17.43368324646921),
 (10, 20.002393888260414)]
In [36]:
li_volume_user = []
total_sum = df_total['usercount'][0] 
print(f"Total usercount (geotagged YFCC data): {total_sum}")
for ix in range(0, 10):
    li_volume_user.append(
        (ix+1, df_user1d_sum["sum_usercount"][ix+1]/(total_sum/100)))
li_volume_user
Total usercount (geotagged YFCC data): 211078
Out[36]:
[(1, 0.0),
 (2, 0.058746055960355884),
 (3, 0.595040695856508),
 (4, 4.492652005419797),
 (5, 23.857531339125817),
 (6, 53.143861510910654),
 (7, 75.25796151185817),
 (8, 86.61490065283922),
 (9, 92.70506637356806),
 (10, 94.55319834374022)]
In [37]:
li_volume_userdays = []
total_sum = df_total['userdays'][0] 
print(f"Total userdays (geotagged YFCC data): {total_sum}")
for ix in range(0,10):
    li_volume_userdays.append(
        (ix+1, df_userday1d_sum["sum_userdays"][ix+1]/(total_sum/100)))
li_volume_userdays
Total userdays (geotagged YFCC data): 5970374
Out[37]:
[(1, 0.0),
 (2, 0.0018591800111684795),
 (3, 0.025040307357629524),
 (4, 0.25866051272499846),
 (5, 2.184000533299924),
 (6, 9.936261949418915),
 (7, 25.048631794256107),
 (8, 47.15604081084368),
 (9, 61.42615186251314),
 (10, 63.95867327574454)]
In [38]:
df_volume_post = pd.DataFrame(
    li_volume_post, columns =['GeoHash Accuracy', 'counts']) 
df_volume_user = pd.DataFrame(
    li_volume_user, columns =['GeoHash Accuracy', 'counts']) 
df_volume_userdays = pd.DataFrame(
    li_volume_userdays, columns =['GeoHash Accuracy', 'counts']) 

Plot together

In [39]:
# set style
plt.style.use('fivethirtyeight')
# plt.style.use('ggplot')
# plt.style.use('seaborn')
f1 = interp1d(
    x=df_counts_user_outlier["GeoHash Accuracy"], 
    y=df_counts_user_outlier["counts"], 
    kind='cubic', fill_value="extrapolate")
f2 = interp1d(
    x=df_volume_post["GeoHash Accuracy"], 
    y=df_volume_post["counts"], 
    kind='cubic', fill_value="extrapolate")
f3 = interp1d(
    x=df_volume_user["GeoHash Accuracy"], 
    y=df_volume_user["counts"], 
    kind='cubic', fill_value="extrapolate")
f4 = interp1d(
    x=df_volume_userdays["GeoHash Accuracy"], 
    y=df_volume_userdays["counts"], 
    kind='cubic', fill_value="extrapolate")
f5 = interp1d(
    x=df_counts["GeoHash Accuracy"], 
    y=df_counts["Distinct Coordinates Perc"], 
    kind='cubic', fill_value="extrapolate")
xnew = np.linspace(1, 10, num=40, endpoint=True)
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
plt.axvline(5, 0, 100, label='Chosen geo-privacy threshold', linestyle=':', alpha=0.5)

marker_plotting_opts = {
    "color": "white",
    "markersize": 8,
    "markeredgewidth": 0.5,
    "markeredgecolor": "grey"}
# ['008fd5', 'fc4f30', 'e5ae38', '6d904f', '8b8b8b', '810f7c']
plt.plot(xnew, f1(xnew), '--', label='User-Coordinate outliers', color="#fc4f30")
plt.plot(xnew, f5(xnew), '--', label='Coordinate outliers', color="#008fd5")
plt.plot(xnew, f3(xnew), '--', label='User outliers', color="#6d904f") 
plt.plot(xnew, f4(xnew), '--', label='Userday outliers', color="#e5ae38") 
plt.plot(xnew, f2(xnew), '--', label='Post outliers', color="#810f7c")
plt.plot(df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"], 'o', **marker_plotting_opts)
plt.plot(df_volume_post["GeoHash Accuracy"], df_volume_post["counts"], 'o', **marker_plotting_opts)
plt.plot(df_volume_user["GeoHash Accuracy"], df_volume_user["counts"], 'o', **marker_plotting_opts)
plt.plot(df_volume_userdays["GeoHash Accuracy"], df_volume_userdays["counts"], 'o', **marker_plotting_opts)
plt.plot(df_counts["GeoHash Accuracy"], df_counts["Distinct Coordinates Perc"], 'o', **marker_plotting_opts)

plt.xticks(np.arange(min(xnew), max(xnew)+1, 1.0))
ax = plt.gca()
fig = plt.gcf()
fig.set_size_inches(10, 5)
fig.set_facecolor('white')
# fig.suptitle('Volume of global outliers', fontsize=20, color='grey')
ax.legend()
# label markers
labels = [f'{i:.1f}' for i in df_counts_user_outlier["counts"]]
for label, x, y in zip(labels, df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"]):
    if float(label) > 0.01:
        offset_x = -0.5
        if x == 5:
            offset_x = 1.2
        plt.annotate(f'{label}%', xy=(x, y), xytext=(x+offset_x, y+0.2))
        
# labels = [f'{i:.2f}' for i in df_volume_post["counts"]]
# for label, x, y in zip(labels, df_volume_post["GeoHash Accuracy"], df_volume_post["counts"]):
#     if float(label) < 99:
#         plt.annotate(f'{label}%', xy=(x, y), xytext=(x-0.2, y-0.8))
# general formatting
ax.set_xlim(10.2, 0)  # decreasing accuracy
ax.set_ylim(-3, 101)  # decreasing accuracy
ax.set_xlabel("GeoHash Precision")
ax.set_ylabel('Percentage')
ax.xaxis.label.set_color('grey')
ax.yaxis.label.set_color('grey')
ax.set_facecolor('white')
# ax.grid(which='minor', alpha=0.2)
# ax.grid(which='major', alpha=0.5)
ax.grid(color='grey', linestyle='-', linewidth=0.5)

import matplotlib as mpl
mpl.rcParams["savefig.facecolor"] = 'white'

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.tick_params(axis='x', colors='grey')
ax.tick_params(axis='y', colors='grey')

plt.gcf().subplots_adjust(bottom=0.15, left=0.15)
# plt.tight_layout()
plt.savefig("GeoHash_accuracy_yfcc_coordinates_perc-1-d_combined_notitle.png", dpi=300)
plt.show()

Additional statistics

The privacy-preserving effect of HLL depends on the size of the HLL sets: small sets that contain only one user ID are the most vulnerable to intersection attacks.

How many small sets exist in a dataset depends on the overall distribution of the data, which can be clustered or dispersed. Social media data typically features a long-tailed, heavily clustered pattern, which enhances user privacy at crowded places.

These assertions can be quantitatively substantiated with the given dataset.
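
As a simplified illustration of why single-item sets are the weak spot (plain SHA-256 stands in for the postgres-hll hashing and binary format here): hashing is deterministic, so an attacker who can enumerate candidate ids can recompute the sketch for each candidate and match it against a published single-user set, whereas sets containing many users only permit probabilistic membership tests. Note that the ids in this notebook are additionally pseudonymized with the keyed crypt_hash() before they enter any HLL set, so this kind of recomputation would also require knowledge of the secret key.

In [ ]:
import hashlib

def single_item_sketch(user_id: str) -> str:
    """Simplified stand-in for hll_add(hll_empty(), hll_hash_text(user_id))."""
    return hashlib.sha256(user_id.encode()).hexdigest()

# hypothetical published sketch for a coordinate visited by exactly one user
published_set = single_item_sketch("user_guid_42")

# an attacker with a candidate list re-identifies the single user at that location
candidates = ["user_guid_7", "user_guid_42", "user_guid_99"]
print([c for c in candidates if single_item_sketch(c) == published_set])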

How many distinct coordinates exist in the raw dataset?

In [12]:
sql_query = """
SELECT COUNT(DISTINCT(ST_Y(post_latlng), ST_X(post_latlng)))
FROM topical.post
WHERE post_geoaccuracy IN ('place', 'latlng', 'city');
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
Out[12]:
count
0 12692723

How many distinct user ids exist in the raw dataset?

In [20]:
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.user_0 AS
    SELECT
        extensions.crypt_hash(
            t1.user_guid, '{crypt_key}') as "user_guid"
    FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
SELECT 100000000
In [21]:
sql_query = f"""
SELECT COUNT(DISTINCT(user_guid))
FROM mviews.user_0 t1;
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
count
0 581099
In [26]:
db_connection_hlldb.rollback()

If those user ids are stored as individual HLL sets, how many distinct HLL sets exist (a naïve direct count of the binary hll strings)?

In [27]:
sql_query = """
SELECT EXISTS (
   SELECT FROM information_schema.tables 
   WHERE  table_schema = 'mviews'
   AND    table_name   = 'user_0'
   );
"""
result = db_conn_hlldb.query(sql_query)
display(result.empty)
False
In [28]:
if not result.loc[0, 'exists']:
    sql_query = f"""
    IMPORT FOREIGN SCHEMA mviews LIMIT TO (
    user_0
    ) FROM SERVER lbsnraw INTO extensions;
    """
    db_cursor_hlldb.execute(sql_query)
    print(db_cursor_hlldb.statusmessage)
In [29]:
sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);
"""
db_cursor_hlldb.execute(sql_query)
print(db_cursor_hlldb.statusmessage)
SELECT 1
In [30]:
sql_query = f"""
CREATE MATERIALIZED VIEW mviews.user_0_hll AS
    SELECT
        hll_add(hll_empty(), hll_hash_text(user_guid))::text AS user_hll
    FROM extensions.user_0;
"""
db_cursor_hlldb.execute(sql_query)
print(db_cursor_hlldb.statusmessage)
SELECT 100000000
In [37]:
sql_query = f"""
SELECT COUNT(DISTINCT(user_hll))
FROM mviews.user_0_hll t1;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
count
0 17358

If the register width and log2m HLL parameters are reduced, how does this affect the total distinct count of HLL sets?

In [32]:
sql_query = f"""
SELECT hll_set_defaults(10, 4, 0, 1);
CREATE MATERIALIZED VIEW mviews.user_0_hll_v2 AS
    SELECT
        hll_add(hll_empty(), hll_hash_text(user_guid))::text AS user_hll
    FROM extensions.user_0;
"""
db_cursor_hlldb.execute(sql_query)
print(db_cursor_hlldb.statusmessage)
SELECT 100000000

Store changes made so far:

In [38]:
db_connection_hlldb.commit()

Total user estimate for different register width and log2m parameters:

log2m=11 and regwidth=5

In [33]:
sql_query = f"""
SELECT
    hll_cardinality(
        hll_union_agg(
            user_hll::hll))::int AS usercount
FROM mviews.user_0_hll;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
usercount
0 589475

log2m=10 and regwidth=4

In [34]:
sql_query = f"""
SELECT
    hll_cardinality(
        hll_union_agg(
            user_hll::hll))::int AS usercount
FROM mviews.user_0_hll_v2;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
usercount
0 597234
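
A back-of-the-envelope sketch of what these two parameter sets imply, assuming the postgres-hll dense layout of 2^log2m registers of regwidth bits each and the usual ~1.04/sqrt(m) relative standard error; both estimates above (589,475 and 597,234 vs. the exact count of 581,099) deviate by less than the respective expected error.

In [ ]:
import math

def hll_footprint(log2m: int, regwidth: int):
    m = 2 ** log2m                   # number of registers
    size_bytes = m * regwidth / 8    # dense register payload size
    rel_error = 1.04 / math.sqrt(m)  # typical HLL standard error
    return m, size_bytes, rel_error

for log2m, regwidth in [(11, 5), (10, 4)]:
    m, size_bytes, err = hll_footprint(log2m, regwidth)
    print(f"log2m={log2m}, regwidth={regwidth}: {m} registers, "
          f"~{size_bytes:.0f} bytes, ~{err:.1%} expected relative error")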

Total estimated statistics for postcount, userdays, and coordinate count (Table 2)

Coordinates:

In [45]:
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.coordinate_0 AS
    SELECT
        (ST_Y(post_latlng)::text || ST_X(post_latlng)::text) AS "coordinate"
    FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
CREATE MATERIALIZED VIEW
In [46]:
sql_query = """
SELECT EXISTS (
   SELECT FROM information_schema.tables 
   WHERE  table_schema = 'extensions'
   AND    table_name   = 'coordinate_0'
   );
"""
result = db_conn_hlldb.query(sql_query)
display(result.empty)
False
In [49]:
if not result.loc[0, 'exists']:
    sql_query = f"""
    IMPORT FOREIGN SCHEMA mviews LIMIT TO (
        coordinate_0
    ) FROM SERVER lbsnraw INTO extensions;
    """
    db_cursor_hlldb.execute(sql_query)
    print(db_cursor_hlldb.statusmessage)

sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);
SELECT
    hll_cardinality(
        hll_add_agg(
                hll_hash_text(coordinate)
            )
        )::int AS coordinatecount
FROM extensions.coordinate_0;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
coordinatecount
0 12756691

Postcount:

In [50]:
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.post_0 AS
    SELECT
        extensions.crypt_hash(
            t1.post_guid, '{crypt_key}') as "post_guid"
    FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
CREATE MATERIALIZED VIEW

rawdb count:

In [51]:
sql_query = f"""
SELECT COUNT(DISTINCT(post_guid))
FROM mviews.post_0;
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
count
0 100000000

hlldb count:

In [54]:
sql_query = """
SELECT EXISTS (
   SELECT FROM information_schema.tables 
   WHERE  table_schema = 'extensions'
   AND    table_name   = 'post_0'
   );
"""
result = db_conn_hlldb.query(sql_query)
display(result)
exists
0 True
In [59]:
if not result.loc[0, 'exists']:
    sql_query = f"""
    IMPORT FOREIGN SCHEMA mviews LIMIT TO (
        post_0
    ) FROM SERVER lbsnraw INTO extensions;
    """
    db_cursor_hlldb.execute(sql_query)
    print(db_cursor_hlldb.statusmessage)
    
sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);

SELECT
    hll_cardinality(
        hll_add_agg(
                hll_hash_text(post_guid)
            )
        )::int AS postcount
FROM extensions.post_0;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
postcount
0 98553392

User day count:

In [60]:
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.userday_0 AS
    SELECT
        extensions.crypt_hash(
            (user_guid || to_char(t1.post_create_date, 'yyyy-MM-dd')),
            '{crypt_key}') AS "user_day"
    FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
CREATE MATERIALIZED VIEW

rawdb count:

In [61]:
sql_query = f"""
SELECT COUNT(DISTINCT(user_day))
FROM mviews.userday_0;
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
count
0 17662780

hlldb count:

In [62]:
sql_query = """
SELECT EXISTS (
   SELECT FROM information_schema.tables 
   WHERE  table_schema = 'extensions'
   AND    table_name   = 'userday_0'
   );
"""
result = db_conn_hlldb.query(sql_query)
display(result.empty)
False
In [63]:
if not result.loc[0, 'exists']:
    sql_query = f"""
    IMPORT FOREIGN SCHEMA mviews LIMIT TO (
        userday_0
    ) FROM SERVER lbsnraw INTO extensions;
    """
    db_cursor_hlldb.execute(sql_query)
    print(db_cursor_hlldb.statusmessage)
    

sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);

SELECT
    hll_cardinality(
        hll_add_agg(
                hll_hash_text(user_day)
            )
        )::int AS userdays
FROM extensions.userday_0;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
userdays
0 17678373
In [64]:
db_connection_hlldb.commit()

Additional counts of single-item HLL sets

Total number of single-item HLL sets for GeoHash 5, which reflects the chosen initial granularity of spatial data at collection time.

In [72]:
sql_query = f"""
SELECT COUNT(*)
FROM spatial.latlng
WHERE hll_cardinality(user_hll)::int = 1;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
count
0 226025
In [83]:
print(
    f"{query_df.loc[0]['count']} locations contain only 1 user id at a GeoHash=5. "
    f"This represents {query_df.loc[0]['count']/(12756691 / 100):.2f}% of "
    f"total distinct locations in the YFCC100M dataset")
226025 locations contain only 1 user id at a GeoHash=5. This represents 1.77% of total distinct locations in the YFCC100M dataset

How many distinct users are present in these single-item HLL sets?

In [87]:
sql_query = f"""
SELECT hll_cardinality(hll_union_agg(user_hll))::int
FROM spatial.latlng
WHERE hll_cardinality(user_hll)::int = 1;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
hll_cardinality
0 50358
In [88]:
print(
    f"{query_df.loc[0]['hll_cardinality']} users have visited at least one location where no other user has been at GeoHash=5. "
    f"This represents {query_df.loc[0]['hll_cardinality']/(597234 / 100):.2f}% of "
    f"total users in the YFCC100M dataset")
50358 users have visited at least one location where no other user has been at GeoHash=5. This represents 8.43% of total users in the YFCC100M dataset

Total number of single-item HLL sets for the 100 km grid aggregation, which reflects the chosen granularity of spatial data at publication time (db internal).

In [11]:
# load union_all_hll() methods from _04_interpretation notebook
from _04_interpretation import *
In [12]:
grid = grid_agg_fromcsv(
    OUTPUT / "csv" / "yfcc_all_est_benchmark_internal.csv",
    columns=["xbin", "ybin", "usercount_hll"],
    metrics=["usercount_est"],
    grid_size=GRID_SIZE_METERS)
In [13]:
len(grid[grid["usercount_est"] == 1])
Out[13]:
3354
In [14]:
len(grid[grid["usercount_est"] > 1])
Out[14]:
9235
In [15]:
print(
    f"{len(grid[grid['usercount_est'] == 1])} grid bins (100km) have a user count cardinality of 1. "
    f"This represents {len(grid[grid['usercount_est'] == 1])/(len(grid[grid['usercount_est'] >= 1]) / 100):.2f}% of "
    f"grid bins with data")
3354 grid bins (100km) have a user count cardinality of 1. This represents 26.64% of grid bins with data

However, how many distinct users are present in these single-item HLL sets of 100 km bins?

In [17]:
cardinality_series = union_all_hll(
        grid[grid["usercount_est"] == 1]["usercount_hll"].dropna(),
        db_conn = db_conn_hlldb)
In [18]:
cardinality_series[0]
Out[18]:
1833
In [19]:
print(
    f"{cardinality_series[0]} users have at least one grid bin (100km) where no other user has been. "
    f"This represents {cardinality_series[0]/(597234 / 100):.2f}% of "
    f"total users in the YFCC100M dataset")
1833 users have at least one grid bin (100km) where no other user has been. This represents 0.31% of total users in the YFCC100M dataset

Published benchmark data (user count >= 100): how many total posts are included in bins with >= 10000 posts (high privacy-preserving effect according to Desfontaines):

In [20]:
grid = grid_agg_fromcsv(
    OUTPUT / "csv" / "yfcc_all_est_benchmark_internal.csv",
    columns=["xbin", "ybin", "postcount_hll"],
    metrics=["postcount_est"],
    grid_size=GRID_SIZE_METERS)
In [21]:
cardinality_series = union_all_hll(
        grid[grid["postcount_est"] >= 10000]["postcount_hll"].dropna(),
        db_conn = db_conn_hlldb)
In [24]:
cardinality_series[0]
Out[24]:
41582251

There are 48,469,829 total geotagged posts (with the chosen geo-accuracy levels) in the YFCC100M dataset; this is the reference value used below.

In [25]:
print(
    f"{cardinality_series[0]} of posts are from bins (100km) with >= 10000 total posts. "
    f"This represents {cardinality_series[0]/(48469829 / 100):.2f}% of "
    f"total posts in the YFCC100M dataset")
41582251 of posts are from bins (100km) with >= 10000 total posts. This represents 85.79% of total posts in the YFCC100M dataset

Convert notebook to html

In [26]:
!jupyter nbconvert --to html_toc \
    --output-dir=../out/html ./Figure2_outlier_analysis.ipynb \
    --template=../nbconvert.tpl \
    --ExtractOutputPreprocessor.enabled=False # create single output file
[NbConvertApp] Converting notebook ./Figure2_outlier_analysis.ipynb to html_toc
[NbConvertApp] Writing 706338 bytes to ../out/html/Figure2_outlier_analysis.html
In [ ]: