import pandas as pd
import numpy as np
Pandas 101
Using Dictionary to create a dataframe
- The values of the dictionary can be a tuple, list, Numpy array, or pandas series object
= {
data "Name": ["Alice", "Bob", "Charlie", "David", "Emily"],
"Age": [25, 32, 19, 47, 28],
"Gender": ["Female", "Male", "Male", "Male", "Female"],
"City": ["New York", "Paris", "London", "San Francisco", "Tokyo"],
"Country": ["USA", "France", "UK", "USA", "Japan"]
}
= [0,1,2,3,4] row_labels
= pd.DataFrame(data,index=row_labels) df
df
Name | Age | Gender | City | Country | |
---|---|---|---|---|---|
0 | Alice | 25 | Female | New York | USA |
1 | Bob | 32 | Male | Paris | France |
2 | Charlie | 19 | Male | London | UK |
3 | David | 47 | Male | San Francisco | USA |
4 | Emily | 28 | Female | Tokyo | Japan |
= df['City'] cities
# Accessing the single item of a series
# It is the same way as with a dictionary, by using its label as a key
2] cities[
'London'
# Accessing a single row
3] df.loc[
Name David
Age 47
Gender Male
City San Francisco
Country USA
Name: 3, dtype: object
= {
data "Name": ["Alice", "Bob", "Charlie", "David", "Emily"],
"Age": np.array([25, 32, 19, 47, 28]),
"Gender": ["Female", "Male", "Male", "Male", "Female"],
"City": ["New York", "Paris", "London", "San Francisco", "Tokyo"],
"Country": ["USA", "France", "UK", "USA", "Japan"],
"Constant": 100
}
=row_labels,columns=['Constant','Name','Age','Gender','City','Country']) pd.DataFrame(data,index
Constant | Name | Age | Gender | City | Country | |
---|---|---|---|---|---|---|
0 | 100 | Alice | 25 | Female | New York | USA |
1 | 100 | Bob | 32 | Male | Paris | France |
2 | 100 | Charlie | 19 | Male | London | UK |
3 | 100 | David | 47 | Male | San Francisco | USA |
4 | 100 | Emily | 28 | Female | Tokyo | Japan |
Creating dataframe from list of lists
- The same will work with list of tuples, numpy arrays
= [
my_list 1, 2, 3],
[4, 5, 6],
[7, 8, 9]
[ ]
=['A', 'B', 'C']) pd.DataFrame(my_list, columns
A | B | C | |
---|---|---|---|
0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 |
2 | 7 | 8 | 9 |
- when creating a dataframe from numpy array, we can specify the parameter
copy
- when
copy
is False, dataframe is created from a numpy array in place - This is very efficient when creating large dataframes from numpy arrays
- When the array changes, the dataframe will be changed
= np.array([
my_list 1, 2, 3],
[4, 5, 6],
[7, 8, 9]
[ ])
= pd.DataFrame(my_list,
df_1 =['A', 'B', 'C'],
columns=False) copy
df_1
A | B | C | |
---|---|---|---|
0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 |
2 | 7 | 8 | 9 |
0][0] = 100 my_list[
df_1
A | B | C | |
---|---|---|---|
0 | 100 | 2 | 3 |
1 | 4 | 5 | 6 |
2 | 7 | 8 | 9 |
Retrieving Labels and Data
# Get dataframe row labels
df_1.index
RangeIndex(start=0, stop=3, step=1)
# Getting the columns of a dataframe
df_1.columns
Index(['A', 'B', 'C'], dtype='object')
# Get dataframe values without labels
# copy is only available for the 'to_numpy' method
=True) df_1.to_numpy(copy
array([[100, 2, 3],
[ 4, 5, 6],
[ 7, 8, 9]])
### Cannot use 'copy' with values
df.values
array([['Alice', 25, 'Female', 'New York', 'USA'],
['Bob', 32, 'Male', 'Paris', 'France'],
['Charlie', 19, 'Male', 'London', 'UK'],
['David', 47, 'Male', 'San Francisco', 'USA'],
['Emily', 28, 'Female', 'Tokyo', 'Japan']], dtype=object)
df.dtypes
Name object
Age int64
Gender object
City object
Country object
dtype: object
= df.astype(dtype={'Age':np.int32,'Country':'category'}) df
df.dtypes
Name object
Age int32
Gender object
City object
Country category
dtype: object
DF shape and Size
df.ndim
2
df.shape
(5, 5)
df.size
25
# Returns memory usage in bytes
df.memory_usage()
Index 212
Name 40
Age 20
Gender 40
City 40
Country 209
dtype: int64
Accessing Data
# Get column values
'Country'] df[
0 USA
1 France
2 UK
3 USA
4 Japan
Name: Country, dtype: category
Categories (4, object): ['France', 'Japan', 'UK', 'USA']
# Get row values
3] df.loc[
Name David
Age 47
Gender Male
City San Francisco
Country USA
Name: 3, dtype: object
0:1] df.loc[
Name | Age | Gender | City | Country | |
---|---|---|---|---|---|
0 | Alice | 25 | Female | New York | USA |
1 | Bob | 32 | Male | Paris | France |
# when accessing column using loc, we need to provide row labels as well
'City'] df.loc[:,
0 New York
1 Paris
2 London
3 San Francisco
4 Tokyo
Name: City, dtype: object
# we can use iloc when using integer index
0] df.iloc[
Name Alice
Age 25
Gender Female
City New York
Country USA
Name: 0, dtype: object
# Get first two rows and first three columns
# iloc indexing starts from 0
0:2,0:3] df.iloc[
Name | Age | Gender | |
---|---|---|---|
0 | Alice | 25 | Female |
1 | Bob | 32 | Male |
0:2,[0,3]] df.iloc[
Name | City | |
---|---|---|
0 | Alice | New York |
1 | Bob | Paris |
For iloc the stop index is exclusive.
For loc it is inclusive.
Pandas recommends using specialized accessors .at[] and .iat[] to get single data value
0,'City'] df.at[
'New York'
2,2] df.iat[
'Male'
# We can set the value using loc and iloc
'Age'] = [25,32,19,47,28] df.loc[:,
df
Name | Age | Gender | City | Country | |
---|---|---|---|---|---|
0 | Alice | 25 | Female | New York | USA |
1 | Bob | 32 | Male | Paris | France |
2 | Charlie | 19 | Male | London | UK |
3 | David | 47 | Male | San Francisco | USA |
4 | Emily | 28 | Female | Tokyo | Japan |
Inserting and Deleting Rows
= pd.Series(data=['Cheeka', 14,'Male','Pune','India'],
cheeka =df.columns,name=5) index
= df.append(cheeka) df
/tmp/ipykernel_18888/1505868640.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df = df.append(cheeka)
df
Name | Age | Gender | City | Country | |
---|---|---|---|---|---|
0 | Alice | 25 | Female | New York | USA |
1 | Bob | 32 | Male | Paris | France |
2 | Charlie | 19 | Male | London | UK |
3 | David | 47 | Male | San Francisco | USA |
4 | Emily | 28 | Female | Tokyo | Japan |
5 | Cheeka | 14 | Male | Pune | India |
= df.drop(labels=[5]) df
df
Name | Age | Gender | City | Country | |
---|---|---|---|---|---|
0 | Alice | 25 | Female | New York | USA |
1 | Bob | 32 | Male | Paris | France |
2 | Charlie | 19 | Male | London | UK |
3 | David | 47 | Male | San Francisco | USA |
4 | Emily | 28 | Female | Tokyo | Japan |
# Inserting a column
'Region'] = ['North', 'South', 'East', 'West','North'] df[
df
Name | Age | Gender | City | Country | Region | |
---|---|---|---|---|---|---|
0 | Alice | 25 | Female | New York | USA | North |
1 | Bob | 32 | Male | Paris | France | South |
2 | Charlie | 19 | Male | London | UK | East |
3 | David | 47 | Male | San Francisco | USA | West |
4 | Emily | 28 | Female | Tokyo | Japan | North |
# use insert if location of new column is required
=3,column='Programmer', value = 'Python') df.insert(loc
df
Name | Age | Gender | Programmer | City | Country | Region | |
---|---|---|---|---|---|---|---|
0 | Alice | 25 | Female | Python | New York | USA | North |
1 | Bob | 32 | Male | Python | Paris | France | South |
2 | Charlie | 19 | Male | Python | London | UK | East |
3 | David | 47 | Male | Python | San Francisco | USA | West |
4 | Emily | 28 | Female | Python | Tokyo | Japan | North |
Sorting Dataframe
='Age',ascending=False) df.sort_values(by
Name | Age | Gender | Programmer | City | Country | Region | |
---|---|---|---|---|---|---|---|
3 | David | 47 | Male | Python | San Francisco | USA | West |
1 | Bob | 32 | Male | Python | Paris | France | South |
4 | Emily | 28 | Female | Python | Tokyo | Japan | North |
0 | Alice | 25 | Female | Python | New York | USA | North |
2 | Charlie | 19 | Male | Python | London | UK | East |
Filtering Data
'Age']>30] df[df[
Name | Age | Gender | Programmer | City | Country | Region | |
---|---|---|---|---|---|---|---|
1 | Bob | 32 | Male | Python | Paris | France | South |
3 | David | 47 | Male | Python | San Francisco | USA | West |
# Filtering using multiple conditions
'Age']>30) & (df['Gender']=='Male')] df[(df[
Name | Age | Gender | Programmer | City | Country | Region | |
---|---|---|---|---|---|---|---|
1 | Bob | 32 | Male | Python | Paris | France | South |
3 | David | 47 | Male | Python | San Francisco | USA | West |
Getting statistics
df.describe()
Age | |
---|---|
count | 5.000000 |
mean | 30.200000 |
std | 10.521407 |
min | 19.000000 |
25% | 25.000000 |
50% | 28.000000 |
75% | 32.000000 |
max | 47.000000 |
'Age'].mean(),df['Age'].std(),df['Age'].median() df[
(30.2, 10.521406750050108, 28.0)
Fill NA
1,'Age'] = np.nan df.at[
df
Name | Age | Gender | Programmer | City | Country | Region | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.0 | Female | Python | New York | USA | North |
1 | Bob | NaN | Male | Python | Paris | France | South |
2 | Charlie | 19.0 | Male | Python | London | UK | East |
3 | David | 47.0 | Male | Python | San Francisco | USA | West |
4 | Emily | 28.0 | Female | Python | Tokyo | Japan | North |
# By default nan is not considered
'Age'].mean() df[
29.75
# When nan is considered, the mean will return nan if NA values are present
'Age'].mean(skipna=False) df[
nan
=0) df.fillna(value
Name | Age | Gender | Programmer | City | Country | Region | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.0 | Female | Python | New York | USA | North |
1 | Bob | 0.0 | Male | Python | Paris | France | South |
2 | Charlie | 19.0 | Male | Python | London | UK | East |
3 | David | 47.0 | Male | Python | San Francisco | USA | West |
4 | Emily | 28.0 | Female | Python | Tokyo | Japan | North |
='ffill') df.fillna(method
Name | Age | Gender | Programmer | City | Country | Region | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.0 | Female | Python | New York | USA | North |
1 | Bob | 25.0 | Male | Python | Paris | France | South |
2 | Charlie | 19.0 | Male | Python | London | UK | East |
3 | David | 47.0 | Male | Python | San Francisco | USA | West |
4 | Emily | 28.0 | Female | Python | Tokyo | Japan | North |
='bfill') df.fillna(method
Name | Age | Gender | Programmer | City | Country | Region | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.0 | Female | Python | New York | USA | North |
1 | Bob | 19.0 | Male | Python | Paris | France | South |
2 | Charlie | 19.0 | Male | Python | London | UK | East |
3 | David | 47.0 | Male | Python | San Francisco | USA | West |
4 | Emily | 28.0 | Female | Python | Tokyo | Japan | North |
# This method will fill the intermediate values
'Age'].interpolate() df[
0 25.0
1 22.0
2 19.0
3 47.0
4 28.0
Name: Age, dtype: float64
=True) df.dropna(inplace
df
Name | Age | Gender | Programmer | City | Country | Region | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.0 | Female | Python | New York | USA | North |
2 | Charlie | 19.0 | Male | Python | London | UK | East |
3 | David | 47.0 | Male | Python | San Francisco | USA | West |
4 | Emily | 28.0 | Female | Python | Tokyo | Japan | North |
Iterating over a pandas dataframe
We should avoid iterating over rows in a Dataframe
We should use vectorized operations wherever feasible
- use .index and .columns to iterate over rows and columns
- use items() to iterate over columns
- use iteritems() to iterate over columns #Will be depricated in the future
- use iterrows() to iterate over rows
- use itertuples() to iterate over rows and get named tuples
# items() and iteritems() will return a tuple
# The first item will be the row label
# The second item will be the value of the column
for col_label, col in df.items():
print(col_label,col,sep='\n',end='\n\n')
Name
0 Alice
2 Charlie
3 David
4 Emily
Name: Name, dtype: object
Age
0 25.0
2 19.0
3 47.0
4 28.0
Name: Age, dtype: float64
Gender
0 Female
2 Male
3 Male
4 Female
Name: Gender, dtype: object
Programmer
0 Python
2 Python
3 Python
4 Python
Name: Programmer, dtype: object
City
0 New York
2 London
3 San Francisco
4 Tokyo
Name: City, dtype: object
Country
0 USA
2 UK
3 USA
4 Japan
Name: Country, dtype: object
Region
0 North
2 East
3 West
4 North
Name: Region, dtype: object
for row_label, row in df.iterrows():
print(row_label,row,sep='\n',end='\n\n')
0
Name Alice
Age 25.0
Gender Female
Programmer Python
City New York
Country USA
Region North
Name: 0, dtype: object
2
Name Charlie
Age 19.0
Gender Male
Programmer Python
City London
Country UK
Region East
Name: 2, dtype: object
3
Name David
Age 47.0
Gender Male
Programmer Python
City San Francisco
Country USA
Region West
Name: 3, dtype: object
4
Name Emily
Age 28.0
Gender Female
Programmer Python
City Tokyo
Country Japan
Region North
Name: 4, dtype: object
## Accessing specific row values
## row will be a series object
## we can use square bracket indexing to get the value we are interested
for _, row in df.iterrows():
print(row['Name'], row['Age'])
Alice 25.0
Charlie 19.0
David 47.0
Emily 28.0
for row in df.itertuples():
print(row)
Pandas(Index=0, Name='Alice', Age=25.0, Gender='Female', Programmer='Python', City='New York', Country='USA', Region='North')
Pandas(Index=2, Name='Charlie', Age=19.0, Gender='Male', Programmer='Python', City='London', Country='UK', Region='East')
Pandas(Index=3, Name='David', Age=47.0, Gender='Male', Programmer='Python', City='San Francisco', Country='USA', Region='West')
Pandas(Index=4, Name='Emily', Age=28.0, Gender='Female', Programmer='Python', City='Tokyo', Country='Japan', Region='North')
# Accessing the required values in a row
for row in df.itertuples():
print(row.Name, row.Age)
Alice 25.0
Charlie 19.0
David 47.0
Emily 28.0