Import dependencies
import os, sys
import psycopg2 # Postgres API
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from pathlib import Path
from IPython.display import clear_output, display, HTML, Markdown
from scipy.interpolate import interp1d
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
    sys.path.append(module_path)
from modules import tools, preparations
# define path to output directory (figures etc.)
OUTPUT = Path.cwd().parents[0] / "out" / "Figure2_data"
OUTPUT.mkdir(exist_ok=True, parents=True)
Load keys from .env
from dotenv import load_dotenv
dotenv_path = Path.cwd().parents[0] / '.env'
load_dotenv(dotenv_path)
db_user = "postgres"
db_pass = os.getenv('POSTGRES_PASSWORD')
crypt_key = os.getenv("CRYPT_KEY")
read_only_user_key = os.getenv("USER_KEY")
db_connection_hlldb = psycopg2.connect(
host="hlldb",
port="5432",
dbname="hlldb",
user=db_user,
password=db_pass
)
db_connection_rawdb = psycopg2.connect(
host="rawdb",
port="5432",
dbname="rawdb",
user=db_user,
password=db_pass
)
Test connection:
db_query = """
SELECT 1;
"""
# create pandas DataFrame from database data
df = pd.read_sql_query(db_query, db_connection_hlldb)
display(df.head())
Simplify queries by using the DbConn helper class from the tools module:
db_conn_hlldb = tools.DbConn(db_connection_hlldb)
db_conn_hlldb.query("SELECT 1;")
db_conn_rawdb = tools.DbConn(db_connection_rawdb)
db_conn_rawdb.query("SELECT 1;")
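For reference, a minimal sketch of what this wrapper might look like (hypothetical; the actual implementation ships with the repository in py/modules/tools.py):
class DbConnSketch:
    """Hypothetical sketch of tools.DbConn: wrap a psycopg2 connection
    and return query results as pandas DataFrames."""

    def __init__(self, db_connection):
        self.db_connection = db_connection

    def query(self, sql_query: str) -> pd.DataFrame:
        """Execute an SQL query and return the results as a DataFrame."""
        return pd.read_sql_query(sql_query, self.db_connection)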
If we only want to execute statements, without needing DataFrame results, we can use a plain cursor:
db_cursor_hlldb = db_connection_hlldb.cursor()
db_cursor_hlldb.execute("SELECT 1;")
db_cursor_hlldb.fetchone()
db_cursor_rawdb = db_connection_rawdb.cursor()
db_cursor_rawdb.execute("SELECT 1;")
db_cursor_rawdb.fetchone()
If an SQL error occurs, roll back the failed transaction manually with:
db_connection_hlldb.rollback()
db_connection_rawdb.rollback()
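A convenience pattern for this (a hypothetical sketch; in the cells below, rollback() is simply called by hand where needed):
def execute_or_rollback(connection, sql: str):
    """Execute a statement; roll back on failure so the
    connection remains usable for subsequent queries."""
    cursor = connection.cursor()
    try:
        cursor.execute(sql)
        connection.commit()
    except psycopg2.Error as exc:
        connection.rollback()
        print(f"Statement failed and was rolled back: {exc}")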
db_sql = f'''CREATE SCHEMA IF NOT EXISTS mviews;
ALTER DATABASE rawdb
SET search_path = "$user",
social,
spatial,
temporal,
topical,
interlinkage,
extensions,
mviews;
CREATE EXTENSION IF NOT EXISTS pgcrypto SCHEMA extensions;
'''
db_cursor_rawdb.execute(db_sql)
print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
db_sql = f'''/* Produce pseudonymized hash of input id with skey
* - using skey as seed value
* - sha256 cryptographic hash function
* - encode in base64 to reduce length of hash
* - remove trailing '=' from base64 string
* - return as text
*/
CREATE OR REPLACE FUNCTION
extensions.crypt_hash (id text, skey text)
RETURNS text
AS $$
SELECT
RTRIM(
ENCODE(
HMAC(
id::bytea,
skey::bytea,
'sha256'),
'base64'),
'=')
$$
LANGUAGE SQL
STRICT;
'''
db_cursor_rawdb.execute(db_sql)
print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
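The pseudonymization can be reproduced client-side with Python's standard library, which is handy for cross-checking the SQL function (a sketch; not used elsewhere in this notebook):
import base64
import hashlib
import hmac

def crypt_hash_py(guid: str, skey: str) -> str:
    """Mirror extensions.crypt_hash(): HMAC-SHA256 of guid keyed with skey,
    base64-encoded, with trailing '=' padding removed."""
    digest = hmac.new(skey.encode(), guid.encode(), hashlib.sha256).digest()
    return base64.b64encode(digest).decode().rstrip("=")

# example with a hypothetical key:
# crypt_hash_py("12345@N01", "my-secret-key")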
for geohash_precision in range(8, 11):
    db_sql = f'''
    DROP MATERIALIZED VIEW IF EXISTS
    mviews.spatiallatlng_raw_geohash_{geohash_precision:02d};
    '''
    db_cursor_rawdb.execute(db_sql)
    print(db_cursor_rawdb.statusmessage)
    # commit changes to db
    db_connection_rawdb.commit()
The following queries take quite some time:
%%time
for geohash_precision in range(8, 11):
    db_sql = f'''
    CREATE MATERIALIZED VIEW mviews.spatiallatlng_raw_geohash_{geohash_precision:02d} AS
        SELECT extensions.crypt_hash(t1.post_guid, '{crypt_key}') as "post_guid",
            ST_Y(ST_PointFromGeoHash(ST_GeoHash(t1.post_latlng, {geohash_precision}), {geohash_precision})) As "latitude",
            ST_X(ST_PointFromGeoHash(ST_GeoHash(t1.post_latlng, {geohash_precision}), {geohash_precision})) As "longitude",
            extensions.crypt_hash(t1.user_guid, '{crypt_key}') as "user_guid",
            to_char(t1.post_create_date, 'yyyy-MM-dd') as "post_create_date",
            t1.post_geoaccuracy
        FROM topical.post t1
        WHERE t1.post_geoaccuracy IN ('place', 'latlng', 'city');
    '''
    db_cursor_rawdb.execute(db_sql)
    print(db_cursor_rawdb.statusmessage)
    # commit changes to db
    db_connection_rawdb.commit()
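These views snap each coordinate to the center of its GeoHash cell: ST_GeoHash truncates the position to a base-32 cell id of the given precision, and ST_PointFromGeoHash returns that cell's center point. For intuition, a minimal pure-Python sketch of the encoding step (illustrative only; the actual computation happens in PostGIS):
# illustrative GeoHash encoder: alternately bisect the longitude and
# latitude ranges and map each 5-bit group to the base-32 alphabet
BASE32 = "0123456789bcdefghjkmnpqrstuvwxyz"

def geohash_encode(lat: float, lng: float, precision: int) -> str:
    lat_rng, lng_rng = [-90.0, 90.0], [-180.0, 180.0]
    bits = []
    for i in range(precision * 5):
        rng, val = (lng_rng, lng) if i % 2 == 0 else (lat_rng, lat)
        mid = (rng[0] + rng[1]) / 2
        bits.append(1 if val >= mid else 0)
        rng[0 if val >= mid else 1] = mid
    return "".join(
        BASE32[int("".join(map(str, bits[i:i + 5])), 2)]
        for i in range(0, len(bits), 5))

# at precision 5, a cell spans roughly 4.9 x 4.9 km; all coordinates within
# the same cell collapse to one "latitude"/"longitude" pair in the views above
print(geohash_encode(51.0504, 13.7373, 5))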
On rawdb, create a read-only user:
db_sql = f'''
-- cleanup
DROP SERVER IF EXISTS lbsnraw CASCADE;
DROP OWNED BY lbsn_reader;
DROP USER IF EXISTS lbsn_reader;
CREATE USER lbsn_reader WITH
LOGIN
INHERIT
PASSWORD '{read_only_user_key}';
GRANT CONNECT ON DATABASE rawdb TO lbsn_reader;
GRANT USAGE ON SCHEMA mviews TO lbsn_reader;
GRANT SELECT ON ALL TABLES IN SCHEMA mviews TO lbsn_reader;
'''
db_cursor_rawdb.execute(db_sql)
print(db_cursor_rawdb.statusmessage)
# commit changes to db
db_connection_rawdb.commit()
On hlldb, enable the postgres_fdw extension and create the foreign server connection:
db_sql = f'''
CREATE EXTENSION IF NOT EXISTS postgres_fdw SCHEMA extensions;
CREATE SERVER IF NOT EXISTS lbsnraw FOREIGN DATA WRAPPER postgres_fdw OPTIONS (
host 'rawdb',
dbname 'rawdb',
port '5432',
keepalives '1',
keepalives_idle '30',
keepalives_interval '10',
keepalives_count '5',
fetch_size '500000');
DROP USER MAPPING IF EXISTS for postgres SERVER lbsnraw;
CREATE USER MAPPING for postgres
SERVER lbsnraw
OPTIONS (user 'lbsn_reader', password '{read_only_user_key}');
'''
db_cursor_hlldb.execute(db_sql)
print(db_cursor_hlldb.statusmessage)
# commit changes to db
db_connection_hlldb.commit()
table_list = ','.join(
[f'spatiallatlng_raw_geohash_{geohash_precision:02d}' for geohash_precision in range(1, 11)])
db_sql = f'''
DROP FOREIGN TABLE IF EXISTS {table_list};
IMPORT FOREIGN SCHEMA mviews
LIMIT TO (
{table_list})
FROM SERVER lbsnraw INTO extensions;
'''
db_cursor_hlldb.execute(db_sql)
print(db_cursor_hlldb.statusmessage)
# commit changes to db
db_connection_hlldb.commit()
Prepare structure for hll aggregate data per coordinate and GeoHash Level
Make sure the output tables are empty (on a first run, before the tables are created below, this step can be skipped):
for geohash_precision in range(8, 11):
    db_sql = f'''
    TRUNCATE spatial.latlng_{geohash_precision:02d};
    '''
    db_cursor_hlldb.execute(db_sql)
    print(db_cursor_hlldb.statusmessage)
    # commit changes to db
    db_connection_hlldb.commit()
Create structure:
for geohash_precision in range(8, 11):
    db_sql = f'''
    CREATE TABLE IF NOT EXISTS spatial.latlng_{geohash_precision:02d} (
        latitude float,
        longitude float,
        PRIMARY KEY (latitude, longitude),
        latlng_geom geometry(Point, 4326) NOT NULL)
    INHERITS (
        social.user_hll, -- e.g. number of users/latlng (=UPL)
        topical.post_hll, -- e.g. number of posts/latlng
        temporal.date_hll -- e.g. number of userdays/latlng (=PUD)
    );
    '''
    db_cursor_hlldb.execute(db_sql)
    print(db_cursor_hlldb.statusmessage)
    # commit changes to db
    db_connection_hlldb.commit()
for geohash_precision in range(8, 11):
    db_sql = f'''
    INSERT INTO spatial.latlng_{geohash_precision:02d}(latitude, longitude, user_hll, post_hll, date_hll, latlng_geom)
    SELECT latitude,
        longitude,
        hll_add_agg(hll_hash_text(user_guid)) as user_hll,
        hll_add_agg(hll_hash_text(post_guid)) as post_hll,
        hll_add_agg(hll_hash_text(user_guid || post_create_date)) as date_hll,
        ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) as latlng_geom
    FROM extensions.spatiallatlng_raw_geohash_{geohash_precision:02d}
    GROUP BY latitude, longitude;
    '''
    db_cursor_hlldb.execute(db_sql)
    print(db_cursor_hlldb.statusmessage)
    db_connection_hlldb.commit()
# ensure the relative output folder for intermediate data exists
Path('Figure2_data').mkdir(exist_ok=True)
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT
            hll_cardinality(user_hll)::int as user_hll
        FROM spatial.latlng_{geohash_precision:02d}
        ''')
    print(
        f"Queried {len(df_tmp)} distinct coordinates with hll "
        f"aggregates (usercount) for GeoHash precision {geohash_precision}")
    df_tmp.to_csv(
        Path('Figure2_data') / f'geohash{geohash_precision:02d}_spatial_latlng_cardinality.csv',
        mode='w', index=False, header=True)
Union of all hll sets for postcount, userdays and usercount.
file_path = Path('Figure2_data') / 'total-statistics.csv'
if file_path.exists():
    # fast read from intermediate data
    df_total = pd.read_csv(file_path, index_col=0)
else:
    # query from db
    db_sql = '''
    SELECT hll_cardinality(hll_union_agg(post_hll))::int as postcount,
           hll_cardinality(hll_union_agg(date_hll))::int as userdays,
           hll_cardinality(hll_union_agg(user_hll))::int as usercount
    FROM spatial.latlng_01
    '''
    df_total = db_conn_hlldb.query(db_sql)
    # store intermediate data
    df_total.to_csv(
        file_path, mode='w',
        index=True, header=True)
# show results
df_total.head()
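As a side note on what hll_union_agg and hll_cardinality compute: an HLL union is simply the element-wise maximum of two register arrays, and the cardinality is estimated from the register values. A toy pure-Python sketch (illustrative only; not wire-compatible with the postgresql-hll extension):
import hashlib
import math

class MiniHLL:
    """Toy HyperLogLog illustrating hll_add_agg, hll_union_agg and
    hll_cardinality semantics."""

    def __init__(self, log2m: int = 11):
        self.log2m = log2m
        self.m = 1 << log2m            # number of registers
        self.registers = [0] * self.m

    def add(self, item: str):
        h = int.from_bytes(hashlib.sha256(item.encode()).digest()[:8], "big")
        idx = h >> (64 - self.log2m)   # first log2m bits select the register
        rest = (h << self.log2m) & (2**64 - 1)
        # rank = leading zeros in the remaining bits, plus one
        rank = (64 - rest.bit_length() + 1) if rest else (64 - self.log2m + 1)
        self.registers[idx] = max(self.registers[idx], rank)

    def union(self, other: "MiniHLL"):
        # union of two HLL sets = element-wise register maximum
        self.registers = [
            max(a, b) for a, b in zip(self.registers, other.registers)]

    def cardinality(self) -> float:
        alpha = 0.7213 / (1 + 1.079 / self.m)
        raw = alpha * self.m**2 / sum(2.0**-r for r in self.registers)
        zeros = self.registers.count(0)
        if raw <= 2.5 * self.m and zeros:
            # small-range correction (linear counting)
            return self.m * math.log(self.m / zeros)
        return raw

hll_a, hll_b = MiniHLL(), MiniHLL()
for user in ("user-1", "user-2"):
    hll_a.add(user)
for user in ("user-2", "user-3"):
    hll_b.add(user)
hll_a.union(hll_b)
print(round(hll_a.cardinality()))  # ≈ 3 distinct users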
Get the number of total distinct coordinates from raw database (without GeoHash aggregation):
file_path = Path('Figure2_data') / 'distinct-coordinates-yfcc.csv'
if file_path.exists():
    # fast read from intermediate data
    df_distinct_coords = pd.read_csv(file_path, index_col=0)
else:
    # query from db
    db_sql = '''
    SELECT COUNT(DISTINCT(ST_Y(post_latlng), ST_X(post_latlng)))
    FROM topical.post
    WHERE post_geoaccuracy IN ('place', 'latlng', 'city');
    '''
    df_distinct_coords = db_conn_rawdb.query(db_sql)
    # store intermediate data
    df_distinct_coords.to_csv(
        file_path, mode='w',
        index=True, header=True)
df_distinct_coords.head()
For usercount
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT
            {geohash_precision:02d} as geohash,
            hll_cardinality(hll_union_agg(a.user_hll))::int as sum_usercount
        FROM (SELECT
                s.user_hll as user_hll
            FROM spatial.latlng_{geohash_precision:02d} s
            WHERE hll_cardinality(s.user_hll)::int = 1) a
        ''')
    display(df_tmp.head())
    df_tmp['sum_usercount'].fillna(0, inplace=True)
    print(
        f"At GeoHash precision {geohash_precision}, \n"
        f"{df_tmp['sum_usercount'][0]} distinct users are "
        f"found at spatial outlier locations "
        f"(of {df_total['usercount'][0]} total users, "
        f"{df_tmp['sum_usercount'][0]/(df_total['usercount'][0]/100):.0f}%).")
    mode = 'a'
    header = False
    if geohash_precision == 1:
        mode = 'w'
        header = True
    df_tmp.to_csv(
        Path('Figure2_data') / 'outlier-stats-geohash-user.csv',
        mode=mode, index=False, header=header)
For postcount
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT
            {geohash_precision:02d} as geohash,
            hll_cardinality(hll_union_agg(a.post_hll))::int as sum_postcount
        FROM (SELECT
                s.post_hll as post_hll
            FROM spatial.latlng_{geohash_precision:02d} s
            WHERE hll_cardinality(s.post_hll)::int = 1) a
        ''')
    display(df_tmp.head())
    df_tmp['sum_postcount'].fillna(0, inplace=True)
    print(
        f"At GeoHash precision {geohash_precision}, \n"
        f"{df_tmp['sum_postcount'][0]} distinct posts are "
        f"found at spatial outlier locations "
        f"(of {df_total['postcount'][0]} total posts, "
        f"{df_tmp['sum_postcount'][0]/(df_total['postcount'][0]/100):.0f}%).")
    mode = 'a'
    header = False
    if geohash_precision == 1:
        mode = 'w'
        header = True
    df_tmp.to_csv(
        Path('Figure2_data') / 'outlier-stats-geohash-posts.csv',
        mode=mode, index=False, header=header)
For userdays
for geohash_precision in range(1, 11):
    df_tmp = db_conn_hlldb.query(
        f'''
        SELECT
            {geohash_precision:02d} as geohash,
            hll_cardinality(hll_union_agg(a.date_hll))::int as sum_userdays
        FROM (SELECT
                s.date_hll as date_hll
            FROM spatial.latlng_{geohash_precision:02d} s
            WHERE hll_cardinality(s.date_hll)::int = 1) a
        ''')
    display(df_tmp.head())
    df_tmp['sum_userdays'].fillna(0, inplace=True)
    print(
        f"At GeoHash precision {geohash_precision}, \n"
        f"{df_tmp['sum_userdays'][0]} distinct userdays are "
        f"found at spatial outlier locations "
        f"(of {df_total['userdays'][0]} total userdays, "
        f"{df_tmp['sum_userdays'][0]/(df_total['userdays'][0]/100):.0f}%).")
    mode = 'a'
    header = False
    if geohash_precision == 1:
        mode = 'w'
        header = True
    df_tmp.to_csv(
        Path('Figure2_data') / 'outlier-stats-geohash-userdays.csv',
        mode=mode, index=False, header=header)
home_path = Path.cwd() / 'Figure2_data'
all_files = home_path.glob('*_spatial_latlng_cardinality.csv')
dtypes = {'user_hll': int}
li = []
for filename in sorted(all_files):
    df = pd.read_csv(
        filename, index_col=None, dtype=dtypes, encoding='utf-8')
    li.append(df)
df = pd.concat(li, axis=1, ignore_index=True)
# sort values
df = df.apply(lambda x: x.sort_values(ascending=False).values)
# name index
df.rename_axis('cardinality', inplace=True)
df.head()
distinct_coords = []
for ix in range(0, 10):
    distinct_coords.append(
        (ix+1, len(df[ix].dropna())))
# append total distinct coords in dataset,
# without geohash
distinct_coords.append(
    (11, df_distinct_coords["count"][0]))
display(distinct_coords)
# create DataFrame using data
df_counts = pd.DataFrame(
    distinct_coords, columns=['GeoHash Accuracy', 'Distinct Coordinates'])
sns.relplot(x="GeoHash Accuracy", y="Distinct Coordinates",
ci="sd", kind="line", data=df_counts, markers=True, marker='o');
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
ax = plt.gca()
# x = decreasing GeoHash accuracy
ax.set_xlim(11, 0)
Interpolate & style matplotlib
total_distinct_coordinates = df_distinct_coords["count"][0]
df_counts["Distinct Coordinates Perc"] = df_counts["Distinct Coordinates"] / (total_distinct_coordinates/100)
df_counts["Distinct Coordinates"] = df_counts["Distinct Coordinates"]/1000000
df_counts.head()
# set style
plt.style.use('fivethirtyeight')
# plt.style.use('ggplot')
# plt.style.use('seaborn')
f1 = interp1d(
x=df_counts["GeoHash Accuracy"],
# y=df_counts["Distinct Coordinates"],
y=df_counts["Distinct Coordinates Perc"],
kind='cubic', fill_value="extrapolate")
xnew = np.linspace(0, 11, num=40, endpoint=True)
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
plt.plot(xnew, f1(xnew), '--', df_counts["GeoHash Accuracy"], df_counts["Distinct Coordinates Perc"],'o')
plt.xticks(np.arange(min(xnew), max(xnew)+1, 1.0))
ax = plt.gca()
# label markers
labels = [f'{i:.2f}' for i in df_counts["Distinct Coordinates Perc"]]
for label, x, y in zip(labels, df_counts["GeoHash Accuracy"], df_counts["Distinct Coordinates Perc"]):
    if float(label) > 0.01:
        plt.annotate(label, xy=(x, y), xytext=(x-0.2, y+0.2))
# general formatting
ax.set_xlim(10, 0) # decreasing accuracy
ax.set_xlabel("GeoHash Precision")
ax.set_ylabel("Distinct Coordinates \n (Percentage)")
# plt.gcf().subplots_adjust(bottom=0.15, left=0.15)
plt.tight_layout()
plt.savefig("GeoHash_accuracy_yfcc_coordinates.png", dpi=300)
plt.show()
How many distinct coordinates at GeoHash lv 5?
# column ix holds GeoHash precision ix+1
len(df[4].dropna())
How many at lv 9?
len(df[8].dropna())
How many distinct coordinates at GeoHash lv 10 have a user cardinality of 1? (=frequented by 1 user)
df_temp = df[9].dropna()
len(df_temp[df_temp == 1])
... at the highest GeoHash precision, almost 100% of coordinates are frequented by only a single user.
Calculate user-coordinate outliers for all GeoHash levels
Drop NaN rows and calculate total percentage of user-coordinate outliers per GeoHash Level
li_counts = []
for ix in range(0, 10):
    df_temp = df[ix].dropna()
    li_counts.append((ix+1, len(df_temp[df_temp == 1])/(len(df_temp)/100)))
li_counts
df_counts_user_outlier = pd.DataFrame(li_counts, columns=['GeoHash Accuracy', 'counts'])
# set style
plt.style.use('fivethirtyeight')
# plt.style.use('ggplot')
# plt.style.use('seaborn')
f1 = interp1d(
x=df_counts_user_outlier["GeoHash Accuracy"],
y=df_counts_user_outlier["counts"],
kind='cubic', fill_value="extrapolate")
xnew = np.linspace(0, 10, num=40, endpoint=True)
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
plt.plot(xnew, f1(xnew), '--', df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"],'o')
plt.xticks(np.arange(min(xnew), max(xnew)+1, 1.0))
ax = plt.gca()
# label markers
labels = [f'{i:.2f}' for i in df_counts_user_outlier["counts"]]
for label, x, y in zip(labels, df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"]):
    if float(label) > 0.01:
        plt.annotate(f'{label}%', xy=(x, y), xytext=(x-0.2, y+0.2))
# general formatting
ax.set_xlim(10, 0)  # decreasing accuracy
ax.set_ylim(0, 100)  # percentage range
ax.set_xlabel("GeoHash Precision")
ax.set_ylabel('1-Cardinality Cluster \n (Percentage)')
# plt.gcf().subplots_adjust(bottom=0.15, left=0.15)
plt.tight_layout()
plt.savefig("GeoHash_accuracy_yfcc_coordinates_perc-1-d.png", dpi=300)
plt.show()
The evaluation above verifies that user-coordinate outliers are highly identifiable information. However, only a few posts of a user are typically shared at such highly unique (distinct) locations. Most often, a user can be expected to visit highly unique locations (that haven't been shared online by anyone else) only once or a few times. We can expect that users primarily frequent locations where other people have also been. Therefore, a significant share of user-locations should disappear "in the crowd". From a privacy perspective, this is important because such behavioral patterns prevent attackers from gaining a full picture of all locations visited by a single user. Similar effects should be expected for userdays and posts. Let's have a look at the numbers below:
The following files include the estimated unioned counts for all data found at spatial outliers, i.e. at coordinates whose respective cardinality (user, post or userday count) equals 1:
base_path = (
Path.cwd() /
'Figure2_data')
file_path_post1d = (
base_path /
'outlier-stats-geohash-posts.csv')
file_path_userday1d = (
base_path /
'outlier-stats-geohash-userdays.csv')
file_path_user1d = (
base_path /
'outlier-stats-geohash-user.csv')
dtypes = {'sum_usercount': int,
'sum_postcount': int,
'sum_userdays': int}
# .iloc[::-1] will reverse order of index
# descending geohash lvs
df_post1d_sum = pd.read_csv(
file_path_post1d, index_col="geohash",
dtype=dtypes, encoding='utf-8').iloc[::-1]
df_userday1d_sum = pd.read_csv(
file_path_userday1d, index_col="geohash",
dtype=dtypes, encoding='utf-8').iloc[::-1]
df_user1d_sum = pd.read_csv(
file_path_user1d, index_col="geohash",
dtype=dtypes, encoding='utf-8').iloc[::-1]
df_post1d_sum.head()
li_volume_post = []
total_sum = df_total['postcount'][0]
print(f"Total geotagged postcount in YFCC: {total_sum}")
for ix in range(0, 10):
    li_volume_post.append(
        (ix+1, df_post1d_sum["sum_postcount"][ix+1]/(total_sum/100)))
li_volume_post
li_volume_user = []
total_sum = df_total['usercount'][0]
print(f"Total usercount (geotagged YFCC data): {total_sum}")
for ix in range(0, 10):
    li_volume_user.append(
        (ix+1, df_user1d_sum["sum_usercount"][ix+1]/(total_sum/100)))
li_volume_user
li_volume_userdays = []
total_sum = df_total['userdays'][0]
print(f"Total userdays (geotagged YFCC data): {total_sum}")
for ix in range(0, 10):
    li_volume_userdays.append(
        (ix+1, df_userday1d_sum["sum_userdays"][ix+1]/(total_sum/100)))
li_volume_userdays
df_volume_post = pd.DataFrame(
    li_volume_post, columns=['GeoHash Accuracy', 'counts'])
df_volume_user = pd.DataFrame(
    li_volume_user, columns=['GeoHash Accuracy', 'counts'])
df_volume_userdays = pd.DataFrame(
    li_volume_userdays, columns=['GeoHash Accuracy', 'counts'])
Plot together
# set style
plt.style.use('fivethirtyeight')
# plt.style.use('ggplot')
# plt.style.use('seaborn')
f1 = interp1d(
x=df_counts_user_outlier["GeoHash Accuracy"],
y=df_counts_user_outlier["counts"],
kind='cubic', fill_value="extrapolate")
f2 = interp1d(
x=df_volume_post["GeoHash Accuracy"],
y=df_volume_post["counts"],
kind='cubic', fill_value="extrapolate")
f3 = interp1d(
x=df_volume_user["GeoHash Accuracy"],
y=df_volume_user["counts"],
kind='cubic', fill_value="extrapolate")
f4 = interp1d(
x=df_volume_userdays["GeoHash Accuracy"],
y=df_volume_userdays["counts"],
kind='cubic', fill_value="extrapolate")
f5 = interp1d(
x=df_counts["GeoHash Accuracy"],
y=df_counts["Distinct Coordinates Perc"],
kind='cubic', fill_value="extrapolate")
xnew = np.linspace(1, 10, num=40, endpoint=True)
plt.ticklabel_format(style='plain', axis='y', useOffset=False)
plt.axvline(5, label='Chosen geo-privacy threshold', linestyle=':', alpha=0.5)
marker_plotting_opts = {
"color": "white",
"markersize": 8,
"markeredgewidth": 0.5,
"markeredgecolor": "grey"}
# ['008fd5', 'fc4f30', 'e5ae38', '6d904f', '8b8b8b', '810f7c']
plt.plot(xnew, f1(xnew), '--', label='User-Coordinate outliers', color="#fc4f30")
plt.plot(xnew, f5(xnew), '--', label='Coordinate outliers', color="#008fd5")
plt.plot(xnew, f3(xnew), '--', label='User outliers', color="#6d904f")
plt.plot(xnew, f4(xnew), '--', label='Userday outliers', color="#e5ae38")
plt.plot(xnew, f2(xnew), '--', label='Post outliers', color="#810f7c")
plt.plot(df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"], 'o', **marker_plotting_opts)
plt.plot(df_volume_post["GeoHash Accuracy"], df_volume_post["counts"], 'o', **marker_plotting_opts)
plt.plot(df_volume_user["GeoHash Accuracy"], df_volume_user["counts"], 'o', **marker_plotting_opts)
plt.plot(df_volume_userdays["GeoHash Accuracy"], df_volume_userdays["counts"], 'o', **marker_plotting_opts)
plt.plot(df_counts["GeoHash Accuracy"], df_counts["Distinct Coordinates Perc"], 'o', **marker_plotting_opts)
plt.xticks(np.arange(min(xnew), max(xnew)+1, 1.0))
ax = plt.gca()
fig = plt.gcf()
fig.set_size_inches(10, 5)
fig.set_facecolor('white')
# fig.suptitle('Volume of global outliers', fontsize=20, color='grey')
ax.legend()
# label markers
labels = [f'{i:.1f}' for i in df_counts_user_outlier["counts"]]
for label, x, y in zip(labels, df_counts_user_outlier["GeoHash Accuracy"], df_counts_user_outlier["counts"]):
    if float(label) > 0.01:
        offset_x = -0.5
        if x == 5:
            offset_x = 1.2
        plt.annotate(f'{label}%', xy=(x, y), xytext=(x+offset_x, y+0.2))
# labels = [f'{i:.2f}' for i in df_volume_post["counts"]]
# for label, x, y in zip(labels, df_volume_post["GeoHash Accuracy"], df_volume_post["counts"]):
# if float(label) < 99:
# plt.annotate(f'{label}%', xy=(x, y), xytext=(x-0.2, y-0.8))
# general formatting
ax.set_xlim(10.2, 0) # decreasing accuracy
ax.set_ylim(-3, 101)  # percentage range, with small margin
ax.set_xlabel("GeoHash Precision")
ax.set_ylabel('Percentage')
ax.xaxis.label.set_color('grey')
ax.yaxis.label.set_color('grey')
ax.set_facecolor('white')
# ax.grid(which='minor', alpha=0.2)
# ax.grid(which='major', alpha=0.5)
ax.grid(color='grey', linestyle='-', linewidth=0.5)
mpl.rcParams["savefig.facecolor"] = 'white'
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.tick_params(axis='x', colors='grey')
ax.tick_params(axis='y', colors='grey')
plt.gcf().subplots_adjust(bottom=0.15, left=0.15)
# plt.tight_layout()
plt.savefig("GeoHash_accuracy_yfcc_coordinates_perc-1-d_combined_notitle.png", dpi=300)
plt.show()
The privacy-preserving effect of HLL depends on the size of the HLL sets: small HLL sets that contain only one user ID are the most vulnerable to intersection attacks.
How many small sets exist in a dataset depends on the overall distribution of the data, which can be clustered or dispersed. Social media data typically features a long-tailed, heavily clustered pattern, which enhances user privacy at crowded places.
These assertions can be quantitatively substantiated with the given dataset.
How many distinct coordinates exist in the raw dataset?
sql_query = """
SELECT COUNT(DISTINCT(ST_Y(post_latlng), ST_X(post_latlng)))
FROM topical.post
WHERE post_geoaccuracy IN ('place', 'latlng', 'city');
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
How many distinct user ids exist in the raw dataset?
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.user_0 AS
SELECT
extensions.crypt_hash(
t1.user_guid, '{crypt_key}') as "user_guid"
FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
sql_query = f"""
SELECT COUNT(DISTINCT(user_guid))
FROM mviews.user_0 t1;
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
db_connection_hlldb.rollback()
If those user ids are stored as individual HLL sets, how many distinct HLL sets exist (a naïve direct count of the binary hll strings)?
sql_query = """
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_schema = 'mviews'
AND table_name = 'user_0'
);
"""
result = db_conn_hlldb.query(sql_query)
display(result.empty)
if result.empty:
sql_query = f"""
IMPORT FOREIGN SCHEMA mviews LIMIT TO (
user_0
) FROM SERVER lbsnraw INTO extensions;
"""
db_cursor_hlldb.execute(sql_query)
print(db_cursor_hlldb.statusmessage)
sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);
"""
db_cursor_hlldb.execute(sql_query)
print(db_cursor_hlldb.statusmessage)
sql_query = f"""
CREATE MATERIALIZED VIEW mviews.user_0_hll AS
SELECT
hll_add(hll_empty(), hll_hash_text(user_guid))::text AS user_hll
FROM extensions.user_0;
"""
db_cursor_hlldb.execute(sql_query)
print(db_cursor_hlldb.statusmessage)
sql_query = f"""
SELECT COUNT(DISTINCT(user_hll))
FROM mviews.user_0_hll t1;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
If the register width (regwidth) and log2m HLL parameters are reduced, how does this affect the total distinct count of HLL sets?
sql_query = f"""
SELECT hll_set_defaults(10, 4, 0, 1);
CREATE MATERIALIZED VIEW mviews.user_0_hll_v2 AS
SELECT
hll_add(hll_empty(), hll_hash_text(user_guid))::text AS user_hll
FROM extensions.user_0;
"""
db_cursor_hlldb.execute(sql_query)
print(db_cursor_hlldb.statusmessage)
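For intuition: a single-item HLL set has exactly one register set to a non-zero rank, so at most m * (2^regwidth - 1) distinct single-item sketches can exist, with m = 2^log2m (a back-of-the-envelope bound that ignores the sparse/explicit encodings configured via hll_set_defaults):
# upper bound on distinct single-item HLL sketches: one register
# (m = 2^log2m choices) holding one rank value (2^regwidth - 1 choices)
for log2m, regwidth in [(11, 5), (10, 4)]:
    m = 2 ** log2m
    print(
        f"log2m={log2m}, regwidth={regwidth}: "
        f"at most {m * (2**regwidth - 1):,} distinct single-item sketches")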
Store changes made so far:
db_connection_hlldb.commit()
Total user estimate for different register width and log2m parameters:
log2m=11 and regwidth=5
sql_query = f"""
SELECT
hll_cardinality(
hll_union_agg(
user_hll::hll))::int AS usercount
FROM mviews.user_0_hll;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
log2m=10 and regwidth=4
sql_query = f"""
SELECT
hll_cardinality(
hll_union_agg(
user_hll::hll))::int AS usercount
FROM mviews.user_0_hll_v2;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
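For context, the expected relative standard error of a HyperLogLog estimate is about 1.04/sqrt(m), with m = 2^log2m registers, so reducing log2m trades estimation accuracy for fewer possible distinct sets (a quick back-of-the-envelope check):
# expected HLL standard error: ~1.04 / sqrt(m), with m = 2^log2m registers
for log2m in (11, 10):
    m = 2 ** log2m
    print(f"log2m={log2m}: expected standard error ±{1.04 / m ** 0.5:.2%}")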
Total estimated statistics for postcount, userdays and coordinate count (Table 2)
Coordinates:
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.coordinate_0 AS
SELECT
(ST_Y(post_latlng)::text || ST_X(post_latlng)::text) AS "coordinate"
FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
sql_query = """
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_schema = 'extensions'
AND table_name = 'coordinate_0'
);
"""
result = db_conn_hlldb.query(sql_query)
display(result)
if not result.loc[0, "exists"]:
    sql_query = """
    IMPORT FOREIGN SCHEMA mviews LIMIT TO (
        coordinate_0
    ) FROM SERVER lbsnraw INTO extensions;
    """
    db_cursor_hlldb.execute(sql_query)
    print(db_cursor_hlldb.statusmessage)
sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);
SELECT
hll_cardinality(
hll_add_agg(
hll_hash_text(coordinate)
)
)::int AS coordinatecount
FROM extensions.coordinate_0;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
Postcount:
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.post_0 AS
SELECT
extensions.crypt_hash(
t1.post_guid, '{crypt_key}') as "post_guid"
FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
rawdb count:
sql_query = f"""
SELECT COUNT(DISTINCT(post_guid))
FROM mviews.post_0;
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
hlldb count:
sql_query = """
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_schema = 'extensions'
AND table_name = 'post_0'
);
"""
result = db_conn_hlldb.query(sql_query)
display(result)
if not result.loc[0, "exists"]:
    sql_query = """
    IMPORT FOREIGN SCHEMA mviews LIMIT TO (
        post_0
    ) FROM SERVER lbsnraw INTO extensions;
    """
    db_cursor_hlldb.execute(sql_query)
    print(db_cursor_hlldb.statusmessage)
sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);
SELECT
hll_cardinality(
hll_add_agg(
hll_hash_text(post_guid)
)
)::int AS postcount
FROM extensions.post_0;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
User day count:
sql_query = f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS mviews.userday_0 AS
SELECT
extensions.crypt_hash(
(user_guid || to_char(t1.post_create_date, 'yyyy-MM-dd')),
'{crypt_key}') AS "user_day"
FROM topical.post t1;
"""
db_cursor_rawdb.execute(sql_query)
print(db_cursor_rawdb.statusmessage)
rawdb count:
sql_query = f"""
SELECT COUNT(DISTINCT(user_day))
FROM mviews.userday_0;
"""
query_df = db_conn_rawdb.query(sql_query)
display(query_df.head())
hlldb count:
sql_query = """
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_schema = 'extensions'
AND table_name = 'userday_0'
);
"""
result = db_conn_hlldb.query(sql_query)
display(result)
if not result.loc[0, "exists"]:
    sql_query = """
    IMPORT FOREIGN SCHEMA mviews LIMIT TO (
        userday_0
    ) FROM SERVER lbsnraw INTO extensions;
    """
    db_cursor_hlldb.execute(sql_query)
    print(db_cursor_hlldb.statusmessage)
sql_query = f"""
SELECT hll_set_defaults(11, 5, 0, 1);
SELECT
hll_cardinality(
hll_add_agg(
hll_hash_text(user_day)
)
)::int AS userdays
FROM extensions.userday_0;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
db_connection_hlldb.commit()
Additional counts of Single-Item-HLL sets
Total number of single-item HLL sets for GeoHash 5, which reflects the chosen initial granularity of spatial data at collection time.
sql_query = f"""
SELECT COUNT(*)
FROM spatial.latlng
WHERE hll_cardinality(user_hll)::int = 1;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
print(
f"{query_df.loc[0]['count']} locations contain only 1 user id at a GeoHash=5. "
f"This represents {query_df.loc[0]['count']/(12756691 / 100):.2f}% of "
f"total distinct locations in the YFCC100M dataset")
How many distinct users are present in these single-item-hll sets?
sql_query = f"""
SELECT hll_cardinality(hll_union_agg(user_hll))::int
FROM spatial.latlng
WHERE hll_cardinality(user_hll)::int = 1;
"""
query_df = db_conn_hlldb.query(sql_query)
display(query_df.head())
print(
f"{query_df.loc[0]['hll_cardinality']} users have visited at least one location where no other user has been at GeoHash=5. "
f"This represents {query_df.loc[0]['hll_cardinality']/(597234 / 100):.2f}% of "
f"total users in the YFCC100M dataset")
Total number of single-item HLL sets for the 100 km grid, which reflects the chosen granularity of spatial data at publication time (db-internal).
# load union_all_hll() methods from _04_interpretation notebook
from _04_interpretation import *
grid = grid_agg_fromcsv(
OUTPUT / "csv" / "yfcc_all_est_benchmark_internal.csv",
columns=["xbin", "ybin", "usercount_hll"],
metrics=["usercount_est"],
grid_size=GRID_SIZE_METERS)
len(grid[grid["usercount_est"] == 1])
len(grid[grid["usercount_est"] > 1])
print(
f"{len(grid[grid['usercount_est'] == 1])} grid bins (100km) have a user count cardinality of 1. "
f"This represents {len(grid[grid['usercount_est'] == 1])/(len(grid[grid['usercount_est'] >= 1]) / 100):.2f}% of "
f"grid bins with data")
However, how many distinct users are present in these single-item-hll sets of 100km bins?
cardinality_series = union_all_hll(
grid[grid["usercount_est"] == 1]["usercount_hll"].dropna(),
db_conn = db_conn_hlldb)
cardinality_series[0]
print(
f"{cardinality_series[0]} users have at least one grid bin (100km) where no other user has been. "
f"This represents {cardinality_series[0]/(597234 / 100):.2f}% of "
f"total users in the YFCC100M dataset")
Published benchmark data (user count >= 100): How many total posts are included in bins with >= 10,000 posts (a high privacy-preserving effect, according to Desfontaines):
grid = grid_agg_fromcsv(
OUTPUT / "csv" / "yfcc_all_est_benchmark_internal.csv",
columns=["xbin", "ybin", "postcount_hll"],
metrics=["postcount_est"],
grid_size=GRID_SIZE_METERS)
cardinality_series = union_all_hll(
grid[grid["postcount_est"] >= 10000]["postcount_hll"].dropna(),
db_conn = db_conn_hlldb)
cardinality_series[0]
There are 48,469,829 total geotagged posts in the YFCC100M dataset with the chosen geoaccuracy levels (place, latlng, city):
print(
f"{cardinality_series[0]} of posts are from bins (100km) with >= 10000 total posts. "
f"This represents {cardinality_series[0]/(48469829 / 100):.2f}% of "
f"total posts in the YFCC100M dataset")
Convert notebook to html
!jupyter nbconvert --to html_toc \
--output-dir=../out/html ./Figure2_outlier_analysis.ipynb \
--template=../nbconvert.tpl \
--ExtractOutputPreprocessor.enabled=False # create single output file