Herein, we explore two (unlikely but possible) scenarios of how privacy can be compromised using HLL data. We distinguish two attack vectors:
a) an internal attack ("sandy"), where the attacker has full access to the HLL database. This is based on the full HLL data collected in Notebook 2.
b) an external attack ("robert"), where the attacker has access to the published benchmark data. This is based on the published HLL data, which limits grid cells to bins where usercount > 100.
In both scenarios, an attacker would need additional information, such as the cryptographic hash of the target user (or the secret key used to create it); this is discussed in more detail below.
The two examples herein use what Desfontaines et al. (2018) describe as an "intersection attack". Intersection attacks do not provide absolute certainty, but under certain circumstances they can be used to confirm a suspicion or to significantly increase knowledge, which may ultimately compromise privacy.
We demonstrate such intersection attacks in the two scenarios "Sandy" and "Robert", both targeting a user named Alex.
Alex is an actual user included in the YFCC100M dataset. He is one of the authors of this guide and published images from 2008 to 2012 under Creative Commons licenses on Flickr; 120 of these images are geotagged. While Alex is an actual user, the scenarios "Sandy" and "Robert" are purely fictional. They illustrate two specific examples of how Alex's privacy could become compromised.
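The principle behind the union-equality test used below can be illustrated with exact sets as a stand-in for HLL (a minimal sketch; since HLL sets are probabilistic, the real test can additionally yield false positives): if adding Alex's hashed identifier to a grid cell's set does not change that set, the identifier was already contained in it.
# exact-set analogy for the HLL union-equality test used below
alex = {"alex_hash"}
cell_with_alex = {"alex_hash", "user_a", "user_b"}
cell_without_alex = {"user_a", "user_b"}
print(alex | cell_with_alex == cell_with_alex)        # True: contained
print(alex | cell_without_alex == cell_without_alex)  # False: not contained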
# Select scenario
# SCENARIO = "robert"
SCENARIO = "sandy"
import sys
from pathlib import Path
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
    sys.path.append(module_path)
from _03_yfcc_gridagg_hll import *
Additional imports:
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
Connect to the raw database:
db_user = "postgres"
db_pass = os.getenv('POSTGRES_PASSWORD')
# set connection variables
db_host = "rawdb"
db_port = "5432"
db_name = "rawdb"
db_connection = psycopg2.connect(
    host=db_host,
    port=db_port,
    dbname=db_name,
    user=db_user,
    password=db_pass
)
db_connection.set_session(readonly=True)
db_conn = tools.DbConn(db_connection)
db_conn.query("SELECT 1;")
alex_user_id = '96117893@N05'
alex_userday = '2012-05-09'
sql_query = f"""
    SELECT
        t1.user_guid,
        t1.post_guid,
        to_char(t1.post_create_date, 'yyyy-MM-dd') as "post_create_date",
        ST_Y(ST_PointFromGeoHash(ST_GeoHash(t1.post_latlng, 5), 5)) As "latitude",
        ST_X(ST_PointFromGeoHash(ST_GeoHash(t1.post_latlng, 5), 5)) As "longitude"
    FROM topical.post t1
    WHERE user_guid = '{alex_user_id}'
        AND post_geoaccuracy IN ('place', 'latlng', 'city');
    """
This can take a while:
%%time
pickle_path = OUTPUT / "pickles" / "alex_raw_locations.pkl"
if pickle_path.exists():
    alex_raw = pd.read_pickle(pickle_path)
else:
    alex_raw = db_conn.query(sql_query)
    alex_raw.to_pickle(pickle_path)
print(len(alex_raw))
if SCENARIO == 'sandy':
    alex_raw.query(
        f"post_create_date == '{alex_userday}'",
        inplace=True)
alex_raw.head()
geoseries_alex_locations = gp.GeoSeries(
    [Point(alex_location.longitude, alex_location.latitude)
     for _, alex_location in alex_raw.iterrows()], crs=CRS_WGS)
geoseries_alex_locations_proj = geoseries_alex_locations.to_crs(CRS_PROJ)
geoseries_alex_locations_proj.head()
Visualize raw locations of Alex:
world = gp.read_file(
    gp.datasets.get_path('naturalearth_lowres'),
    crs=CRS_WGS)
world = world.to_crs(CRS_PROJ)
Add annotation layer for location labels:
sanfrancisco_coords = (Point(-122.5776844, 37.7576171), "San Francisco")
berlin_coords = (Point(13.1445531, 52.5065133), "Berlin")
caboverde_coords = (Point(-23.0733155, 16.7203123), "Cabo Verde")
df = pd.DataFrame([sanfrancisco_coords, berlin_coords, caboverde_coords], columns=["geometry", "name"])
df.head()
gdf = gp.GeoDataFrame(
    df.drop(
        columns=["geometry"]),
    geometry=df.geometry)
gdf.crs = CRS_WGS
gdf = gdf.to_crs(CRS_PROJ)
gdf['coords'] = gdf['geometry'].apply(lambda x: x.representative_point().coords[:])
gdf['coords'] = [coords[0] for coords in gdf['coords']]
label_off = {
    "San Francisco": (5500000, 1000000),
    "Berlin": (4500000, 1000000),
    "Cabo Verde": (4500000, -1000000)}
label_rad = {
    "San Francisco": 0.1,
    "Berlin": 0.5,
    "Cabo Verde": -0.3}
def annotate_locations(
        gdf: gp.GeoDataFrame, label_off: Dict[str, Tuple[int, int]] = label_off,
        label_rad: Dict[str, float] = label_rad):
    """Annotate map based on a GeoDataFrame of locations"""
    for idx, row in gdf.iterrows():
        plt.annotate(
            text=row['name'],
            xy=row['coords'],
            xytext=np.subtract(row['coords'], label_off.get(row['name'])),
            horizontalalignment='left',
            arrowprops=dict(
                arrowstyle='->',
                connectionstyle=f'arc3,rad={label_rad.get(row["name"])}',
                color='red'))
fig, ax = plt.subplots(1, 1, figsize=(11, 14))
geoseries_alex_locations_proj.buffer(500000).plot(
    ax=ax,
    facecolor="none",
    edgecolor='red',
    linewidth=0.2,
    alpha=0.9,
    label='Alex, actual locations (RAW)')
ax.axis('off')
# combine with world geometry
world.plot(
    ax=ax, color='none', edgecolor='black', linewidth=0.3)
annotate_locations(gdf=gdf)
In 03_yfcc_gridagg_hll.ipynb, aggregate grid data was stored to yfcc_all_est_benchmark.csv, including HLL sets with a cardinality greater than 100. Load this data first, using functions from the previous notebooks.
Read the benchmark data, loading only the chosen metric and its HLL column:
benchmark_data_published = "yfcc_all_est_benchmark.csv"
benchmark_data_internal = "yfcc_all_est_benchmark_internal.csv"
Select userdays or usercount, based on the chosen scenario:
if SCENARIO == "robert":
    metric = "usercount"
else:
    metric = "userdays"
load_opts = {
    "columns": ["xbin", "ybin", f"{metric}_hll"],
    "metrics": [f"{metric}_est"],
    "grid_size": GRID_SIZE_METERS
}
grid_internal = grid_agg_fromcsv(
    OUTPUT / "csv" / benchmark_data_internal,
    **load_opts)
grid_published = grid_agg_fromcsv(
    OUTPUT / "csv" / benchmark_data_published,
    **load_opts)
datasets = [
    grid_published,
    grid_internal
]
grid_published[grid_published[f"{metric}_est"]>5].head()
Connect to hll worker:
db_user = "postgres"
db_pass = os.getenv('POSTGRES_PASSWORD')
# set connection variables
db_host = "hlldb"
db_port = "5432"
db_name = "hlldb"
db_connection = psycopg2.connect(
    host=db_host,
    port=db_port,
    dbname=db_name,
    user=db_user,
    password=db_pass
)
db_connection.set_session(readonly=True)
db_conn = tools.DbConn(db_connection)
db_conn.query("SELECT 1;")
For an intersection attack, we need either the HLL set or the hash of the target. If the hash is known, the HLL set can be recreated from it. Hashes can become known if the secret key is compromised, or if an attacker observes internal memory states during the streaming of values and their conversion to HLL. The secret key that was used to generate the HLL sets is not published.
if SCENARIO == "robert":
    # cryptographic hash of Alex's user ID
    alex_hash = 'fcad382c10535ad1bfdec19651eb7ec93d6d7b9bac7566503b38f5a4f8be56e6'
else:
    # cryptographic hash of Alex's userday (2012-05-09)
    alex_hash = 'cfc0d9890bfdd66728e179c25b243b867122998bc493bd4f41739bce857d7682'
Test intersection attack using a single HLL value:
hll_val = grid_published[grid_published[f"{metric}_est"]>=1].iloc[1][f"{metric}_hll"]
Adjust the hll parameter defaults to match those used when the HLL sets were created (log2m=11, regwidth=5, expthresh=0, sparseon=1):
db_conn.query("SELECT hll_set_defaults(11, 5, 0, 1);")
Use the hll_eq() function (reference) to test the unioned and the original HLL for equality: if adding Alex's HLL to a cell's HLL does not change it, the cell very likely already contained Alex.
alex_hll = f"""
    hll_add_agg(
        hll_hash_text(
            '{alex_hash}'))
    """
sql_query = f"""
    SELECT
        hll_eq(
            hll_union(
                {alex_hll}, '{hll_val}'::hll),
            '{hll_val}'::hll) as hll_equal;
    """
result = db_conn.query(
    sql_query)
result.hll_equal[0]
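As a plausibility check, the same test can be repeated with a random hash that is almost certainly not contained in the data; it should return False (a rare True would be a false positive of the union-equality test). A sketch reusing the connection and hll_val from above:
import secrets

# control: a random hex string that is (almost surely) in no HLL set
random_hll = f"hll_add_agg(hll_hash_text('{secrets.token_hex(32)}'))"
sql_query = f"""
    SELECT
        hll_eq(
            hll_union(
                {random_hll}, '{hll_val}'::hll),
            '{hll_val}'::hll) as hll_equal;
    """
db_conn.query(sql_query).hll_equal[0]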
Repeat the intersection attack for all HLL sets per grid and store the results in a separate column:
for grid in datasets:
    grid["alex"] = None
Test all grid cells:
for dataset_id, grid in enumerate(datasets):
    usermetric_series = grid[f"{metric}_est"].dropna()
    bins_found = 0
    # test every non-empty grid cell for containment of Alex's hash
    for idx, __ in usermetric_series[usermetric_series > 0].items():
        hll_val = grid.loc[idx][f"{metric}_hll"]
        sql_query = f"""
            SELECT hll_eq(
                hll_union(
                    {alex_hll}::hll, '{hll_val}'::hll),
                '{hll_val}'::hll) as hll_equal;
            """
        result = db_conn.query(
            sql_query)
        if result.hll_equal[0]:
            bins_found += 1
            clear_output(wait=True)
            print(
                f"Dataset {dataset_id+1} "
                f"- Number of positive bins found: "
                f"{bins_found}. Last positive bin index: {idx}")
            grid.loc[idx, "alex"] = True
All grid cells marked positive by the intersection attack contain the HLL register pattern produced by the given hash. This strongly suggests, but (since false positives are possible) does not prove, that Alex contributed to these cells.
grid_published[grid_published["alex"] == True]
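To compare both result sets at a glance, count the positive cells per grid (a small summary based on the column created above):
for name, grid in [("published", grid_published), ("internal", grid_internal)]:
    print(f"{name}: {len(grid[grid['alex'] == True])} positive grid cells")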
Check how many positive grid cells have a higher likelihood of being true positives than the Cabo Verde cell (usercount: 56):
if SCENARIO == "robert":
    grid_published[grid_published["alex"] == True].sort_values(
        by=['usercount_est'], ascending=True).head()
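For instance, the number of positive cells based on fewer distinct users than the Cabo Verde cell can be counted directly (a short filter on the estimates loaded above):
if SCENARIO == "robert":
    positives = grid_published[grid_published["alex"] == True]
    print(len(positives[positives["usercount_est"] < 56]))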
color_raw = "red"
color_published = "#810f7c"
color_internal = "#fc4f30"
Get positive grid cells that have only been detected with internal data (usercount < 100):
# concat both positive sets and drop rows occurring in both,
# keeping only cells detected exclusively in the internal data
internal_additional = pd.concat(
    [grid_internal[grid_internal["alex"] == True],
     grid_published[grid_published["alex"] == True]]
    ).drop_duplicates(keep=False)
internal_additional.plot()
Again, check how many of these additional grid cells have a higher likelihood of being true positives than the Cabo Verde cell (usercount: 56):
if SCENARIO == "robert":
    internal_additional[internal_additional["alex"] == True].sort_values(
        by=['usercount_est'], ascending=True).head(30)
Combine layers into single graphic, annotate:
fig, ax = plt.subplots(1, 1, figsize=(11, 14))
internal_additional.centroid.buffer(250000).plot(
    ax=ax,
    facecolor=color_internal,
    edgecolor=color_internal,
    linewidth=1,
    alpha=0.9)
grid_published[grid_published["alex"] == True].plot(
    ax=ax,
    facecolor=color_published,
    edgecolor=color_published,
    linewidth=1,
    alpha=0.9)
geoseries_alex_locations_proj.buffer(500000).plot(
    ax=ax,
    facecolor="none",
    edgecolor=color_raw,
    linewidth=0.5,
    alpha=0.9
)
if SCENARIO == 'sandy':
    label_text_raw = f"on {alex_userday}"
    drop_rows_idx = gdf.index[gdf['name'] == "Cabo Verde"].tolist()
    location_label_gdf = gdf.drop(drop_rows_idx)
else:
    label_text_raw = "actual locations"
    drop_rows_idx = gdf.index[
        gdf['name'].isin(["San Francisco", "Berlin"])].tolist()
    location_label_gdf = gdf.drop(drop_rows_idx)
legend_entry_raw = Line2D(
    [0], [0],
    markeredgecolor="red",
    linestyle="None",
    linewidth=0.5,
    marker='o',
    markerfacecolor='None',
    markersize=15,
    label=f"Alex, {label_text_raw} (RAW)")
external_patch = mpatches.Patch(
    color=color_published,
    label='Query results \non published data \n(usercount > 100)')
legend_entry_internal = Line2D(
    [0], [0],
    markeredgecolor=color_internal,
    linestyle="None",
    linewidth=0.5,
    marker='o',
    markerfacecolor=color_internal,
    markersize=15,
    label='Additional query results \nwith direct database access')
legend_entries = [
    external_patch,
    legend_entry_internal,
    legend_entry_raw
]
plt.legend(
    handles=legend_entries, loc='lower left',
    frameon=False, prop={'size': 16})
# combine with world geometry
world.plot(
    ax=ax, color='none', edgecolor='black', linewidth=0.3)
# fig.patch.set_visible(False)
ax.axis('off')
ax.add_artist(ax.patch)
ax.patch.set_zorder(-1)
fig.tight_layout()
annotate_locations(gdf=location_label_gdf)
fig.savefig(
    OUTPUT / "figures" / f"Alex_privacy_example_{SCENARIO}.png",
    dpi=300, bbox_inches='tight', pad_inches=0)
plt.show()
Finalize notebook:
db_connection.close()
Convert notebook to HTML:
!jupyter nbconvert --to html_toc \
--output-dir=../out/html ./Privacy_test_alex.ipynb \
--template=../nbconvert.tpl \
--ExtractOutputPreprocessor.enabled=False # create single output file