In [1]:
import pandas as pd
import numpy as np
In [3]:
# create df from a numpy array
# A seeded generator is used so that "Restart Kernel -> Run All" always
# produces the same numbers (the legacy np.random.randn draws from the
# unseeded global state and gives different values on every run).
rng = np.random.default_rng(0)
arr = rng.standard_normal((6, 4))  # 6 rows, 4 columns of standard-normal samples
df1 = pd.DataFrame(arr, columns=list("ABCD"))  # one column label per character
df1
Out[3]:
A B C D
0 -0.094947 0.830500 0.581323 -0.854719
1 -0.551840 -0.157045 1.567009 0.153931
2 -0.628956 -1.670197 2.428742 0.472240
3 0.462434 0.179272 0.522368 -0.526308
4 -0.259630 -1.069554 -2.387736 -0.869187
5 0.210141 -0.475744 -0.306078 -0.919558
In [5]:
# Build a data frame from a dict: every key becomes a column label.
# The row labels (article numbers) are passed separately as index.
article_numbers = ["A107", "A108", "A109", "A110"]
shop_articles = dict(
    weight=[10.1, 5.0, 8.3, 7.2],
    color=["red", "green", "blue", "transparent"],
    availability=[False, True, True, False],
    price=8.99,  # scalar -> the same price is used for every article
)
df2 = pd.DataFrame(data=shop_articles, index=article_numbers)

df2
Out[5]:
weight color availability price
A107 10.1 red False 8.99
A108 5.0 green True 8.99
A109 8.3 blue True 8.99
A110 7.2 transparent False 8.99
In [6]:
# each column has its own data type (`dtypes` is a Series: column label -> dtype)
print("column types:\n", df2.dtypes)
column types:
 weight          float64
color            object
availability       bool
price           float64
dtype: object
In [7]:
fname = "things.csv"

# save the data frame as CSV file (Comma Separated Values)
# csv file will also contain header information (column labels)
# and the index labels (article numbers) as first column
df2.to_csv(fname)
In [8]:
# Pandas function to load csv-data into DataFrame
# (Detects column names automatically)
# NOTE: the index labels written by `to_csv` are read back as an ordinary
# column named "Unnamed: 0" and a new integer index is created (see output).
# `pd.read_csv(fname, index_col=0)` would use the first column as index again.
df2_new = pd.read_csv(fname)

display(df2_new)  # Jupyter-Notebook-specific
Unnamed: 0 weight color availability price
0 A107 10.1 red False 8.99
1 A108 5.0 green True 8.99
2 A109 8.3 blue True 8.99
3 A110 7.2 transparent False 8.99
In [9]:
# access individual values by label (row label, column label):
print(df2.loc["A108", "weight"])
df2.loc["A108", "weight"] = 3.4  # set new value
df2.loc["A108", "weight"] += 2  # increase by two (3.4 -> 5.4)

df2
5.0
Out[9]:
weight color availability price
A107 10.1 red False 8.99
A108 5.4 green True 8.99
A109 8.3 blue True 8.99
A110 7.2 transparent False 8.99
In [10]:
# access by numerical (zero-based) positions instead of labels
print(df2.iloc[1, 0])  # row position: 1 (A108), column position: 0 (weight)
5.4
In [12]:
# use slices to change multiple values
# (label-based slices include both end points: A108 and A109 are changed)
# NOTE: `*=` modifies df2 in place; re-running this cell reduces the price again
df2.loc["A108":"A109", "price"] *= 0.30  # reduce to 30% of the old price (i.e. a 70% discount)
df2
Out[12]:
weight color availability price
A107 10.1 red False 8.990
A108 5.4 green True 2.697
A109 8.3 blue True 2.697
A110 7.2 transparent False 8.990
In [13]:
# access multiple columns via a list of labels (-> new df object,
# columns appear in the requested order)
print(df2[["price", "weight"]])
      price  weight
A107  8.990    10.1
A108  2.697     5.4
A109  2.697     8.3
A110  8.990     7.2
In [14]:
# new column (-> provide a height value for every article,
# i.e. one list entry per existing row)
df2["height"] = [10, 20, 30, 40]
df2
Out[14]:
weight color availability price height
A107 10.1 red False 8.990 10
A108 5.4 green True 2.697 20
A109 8.3 blue True 2.697 30
A110 7.2 transparent False 8.990 40
In [15]:
# new row (-> provide a value for every column, in column order:
# weight, color, availability, price, height)
# assigning to a not yet existing label via .loc appends the row
df2.loc["X400"] = [15 , "purple" , True , 25.00 , 50]
df2
Out[15]:
weight color availability price height
A107 10.1 red False 8.990 10
A108 5.4 green True 2.697 20
A109 8.3 blue True 2.697 30
A110 7.2 transparent False 8.990 40
X400 15.0 purple True 25.000 50
In [16]:
# create Series-object with bool-entries (True where weight > 8);
# it carries the same row labels as df2
idcs = df2["weight"] > 8

idcs
Out[16]:
A107     True
A108    False
A109     True
A110    False
X400     True
Name: weight, dtype: bool
In [17]:
# use this Series-object for indexing (keeps only the rows where it is True)
print(df2[idcs])
      weight   color  availability   price  height
A107    10.1     red         False   8.990      10
A109     8.3    blue          True   2.697      30
X400    15.0  purple          True  25.000      50
In [ ]:
# same pattern without intermediate variable
# (note the opposite condition: selects the articles with weight < 8):
print(df2[df2["weight"] < 8])
In [18]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df2.describe()
Out[18]:
weight price height
count 5.000000 5.000000 5.000000
mean 9.200000 9.674800 30.000000
std 3.664014 9.126596 15.811388
min 5.400000 2.697000 10.000000
25% 7.200000 2.697000 20.000000
50% 8.300000 8.990000 30.000000
75% 10.100000 8.990000 40.000000
max 15.000000 25.000000 50.000000
In [19]:
# arithmetic mean of a single column
df2["price"].mean()
Out[19]:
9.674800000000001
In [20]:
# median of a single column
df2["weight"].median()
Out[20]:
8.3
In [21]:
# largest value of a single column
df2["weight"].max()
Out[21]:
15.0
In [24]:
print("shorthand notation (if column label is valid python name)\n")
# attribute access (df2.weight) and item access (df2["weight"]) return the
# same column, so the element-wise comparison is True in every row
print(df2.weight == df2["weight"], "\n")
# Series.all() is the vectorized reduction; the builtin all() would iterate
# over the Series element by element in Python
print((df2.weight == df2["weight"]).all())
shorthand notation (if column label is valid python name)

A107    True
A108    True
A109    True
A110    True
X400    True
Name: weight, dtype: bool 

True
In [25]:
# combine function application with boolean indexing:
# mean weight of those articles whose weight is greater than 8
df2[df2.weight>8].weight.mean()
Out[25]:
11.133333333333333
In [26]:
# apply an arbitrary function (here np.diff) to each (selected) column
# np.diff returns the differences of consecutive entries, so the result has
# one row less than df2 and a plain integer index instead of the article labels
print(df2[["price", "weight"]].apply(np.diff))
    price  weight
0  -6.293    -4.7
1   0.000     2.9
2   6.293    -1.1
3  16.010     7.8
In [28]:
# export this notebook as HTML file via the third-party package ipydex
# (the file name is reported in the cell output)
import ipydex
ipydex.save_current_nb_as_html()
`pandas_demo.html` written.