import pandas as pd
import numpy as npPandas 101
Using Dictionary to create a dataframe
- The values of the dictionary can be a tuple, list, Numpy array, or pandas series object
data = {
"Name": ["Alice", "Bob", "Charlie", "David", "Emily"],
"Age": [25, 32, 19, 47, 28],
"Gender": ["Female", "Male", "Male", "Male", "Female"],
"City": ["New York", "Paris", "London", "San Francisco", "Tokyo"],
"Country": ["USA", "France", "UK", "USA", "Japan"]
}
row_labels = [0,1,2,3,4]df = pd.DataFrame(data,index=row_labels)df| Name | Age | Gender | City | Country | |
|---|---|---|---|---|---|
| 0 | Alice | 25 | Female | New York | USA |
| 1 | Bob | 32 | Male | Paris | France |
| 2 | Charlie | 19 | Male | London | UK |
| 3 | David | 47 | Male | San Francisco | USA |
| 4 | Emily | 28 | Female | Tokyo | Japan |
cities = df['City']# Accessing the single item of a series
# It is the same way as with a dictionary, by using its label as a key
cities[2]'London'
# Accessing a single row
df.loc[3]Name David
Age 47
Gender Male
City San Francisco
Country USA
Name: 3, dtype: object
data = {
"Name": ["Alice", "Bob", "Charlie", "David", "Emily"],
"Age": np.array([25, 32, 19, 47, 28]),
"Gender": ["Female", "Male", "Male", "Male", "Female"],
"City": ["New York", "Paris", "London", "San Francisco", "Tokyo"],
"Country": ["USA", "France", "UK", "USA", "Japan"],
"Constant": 100
}pd.DataFrame(data,index=row_labels,columns=['Constant','Name','Age','Gender','City','Country'])| Constant | Name | Age | Gender | City | Country | |
|---|---|---|---|---|---|---|
| 0 | 100 | Alice | 25 | Female | New York | USA |
| 1 | 100 | Bob | 32 | Male | Paris | France |
| 2 | 100 | Charlie | 19 | Male | London | UK |
| 3 | 100 | David | 47 | Male | San Francisco | USA |
| 4 | 100 | Emily | 28 | Female | Tokyo | Japan |
Creating dataframe from list of lists
- The same will work with list of tuples, numpy arrays
my_list = [
[1, 2, 3],
[4, 5, 6],
[7, 8, 9]
]pd.DataFrame(my_list, columns=['A', 'B', 'C'])| A | B | C | |
|---|---|---|---|
| 0 | 1 | 2 | 3 |
| 1 | 4 | 5 | 6 |
| 2 | 7 | 8 | 9 |
- when creating a dataframe from numpy array, we can specify the parameter
copy - when
copyis False, dataframe is created from a numpy array in place - This is very efficient when creating large dataframes from numpy arrays
- When the array changes, the dataframe will be changed
my_list = np.array([
[1, 2, 3],
[4, 5, 6],
[7, 8, 9]
])df_1 = pd.DataFrame(my_list,
columns=['A', 'B', 'C'],
copy=False)df_1| A | B | C | |
|---|---|---|---|
| 0 | 1 | 2 | 3 |
| 1 | 4 | 5 | 6 |
| 2 | 7 | 8 | 9 |
my_list[0][0] = 100df_1| A | B | C | |
|---|---|---|---|
| 0 | 100 | 2 | 3 |
| 1 | 4 | 5 | 6 |
| 2 | 7 | 8 | 9 |
Retrieving Labels and Data
# Get dataframe row labels
df_1.indexRangeIndex(start=0, stop=3, step=1)
# Getting the columns of a dataframe
df_1.columnsIndex(['A', 'B', 'C'], dtype='object')
# Get dataframe values without labels
# copy is only available for the 'to_numpy' method
df_1.to_numpy(copy=True)array([[100, 2, 3],
[ 4, 5, 6],
[ 7, 8, 9]])
### Cannot use 'copy' with values
df.valuesarray([['Alice', 25, 'Female', 'New York', 'USA'],
['Bob', 32, 'Male', 'Paris', 'France'],
['Charlie', 19, 'Male', 'London', 'UK'],
['David', 47, 'Male', 'San Francisco', 'USA'],
['Emily', 28, 'Female', 'Tokyo', 'Japan']], dtype=object)
df.dtypesName object
Age int64
Gender object
City object
Country object
dtype: object
df = df.astype(dtype={'Age':np.int32,'Country':'category'})df.dtypesName object
Age int32
Gender object
City object
Country category
dtype: object
DF shape and Size
df.ndim2
df.shape(5, 5)
df.size25
# Returns memory usage in bytes
df.memory_usage()Index 212
Name 40
Age 20
Gender 40
City 40
Country 209
dtype: int64
Accessing Data
# Get column values
df['Country']0 USA
1 France
2 UK
3 USA
4 Japan
Name: Country, dtype: category
Categories (4, object): ['France', 'Japan', 'UK', 'USA']
# Get row values
df.loc[3]Name David
Age 47
Gender Male
City San Francisco
Country USA
Name: 3, dtype: object
df.loc[0:1]| Name | Age | Gender | City | Country | |
|---|---|---|---|---|---|
| 0 | Alice | 25 | Female | New York | USA |
| 1 | Bob | 32 | Male | Paris | France |
# when accessing column using loc, we need to provide row labels as well
df.loc[:,'City']0 New York
1 Paris
2 London
3 San Francisco
4 Tokyo
Name: City, dtype: object
# we can use iloc when using integer index
df.iloc[0]Name Alice
Age 25
Gender Female
City New York
Country USA
Name: 0, dtype: object
# Get first two rows and first three columns
# iloc indexing starts from 0
df.iloc[0:2,0:3]| Name | Age | Gender | |
|---|---|---|---|
| 0 | Alice | 25 | Female |
| 1 | Bob | 32 | Male |
df.iloc[0:2,[0,3]]| Name | City | |
|---|---|---|
| 0 | Alice | New York |
| 1 | Bob | Paris |
For iloc the stop index is exclusive.
For loc it is inclusive.
Pandas recommends using specialized accessors .at[] and .iat[] to get single data value
df.at[0,'City']'New York'
df.iat[2,2]'Male'
# We can set the value using loc and iloc
df.loc[:,'Age'] = [25,32,19,47,28]df| Name | Age | Gender | City | Country | |
|---|---|---|---|---|---|
| 0 | Alice | 25 | Female | New York | USA |
| 1 | Bob | 32 | Male | Paris | France |
| 2 | Charlie | 19 | Male | London | UK |
| 3 | David | 47 | Male | San Francisco | USA |
| 4 | Emily | 28 | Female | Tokyo | Japan |
Inserting and Deleting Rows
cheeka = pd.Series(data=['Cheeka', 14,'Male','Pune','India'],
index=df.columns,name=5)df = df.append(cheeka)/tmp/ipykernel_18888/1505868640.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df = df.append(cheeka)
df| Name | Age | Gender | City | Country | |
|---|---|---|---|---|---|
| 0 | Alice | 25 | Female | New York | USA |
| 1 | Bob | 32 | Male | Paris | France |
| 2 | Charlie | 19 | Male | London | UK |
| 3 | David | 47 | Male | San Francisco | USA |
| 4 | Emily | 28 | Female | Tokyo | Japan |
| 5 | Cheeka | 14 | Male | Pune | India |
df = df.drop(labels=[5])df| Name | Age | Gender | City | Country | |
|---|---|---|---|---|---|
| 0 | Alice | 25 | Female | New York | USA |
| 1 | Bob | 32 | Male | Paris | France |
| 2 | Charlie | 19 | Male | London | UK |
| 3 | David | 47 | Male | San Francisco | USA |
| 4 | Emily | 28 | Female | Tokyo | Japan |
# Inserting a column
df['Region'] = ['North', 'South', 'East', 'West','North']df| Name | Age | Gender | City | Country | Region | |
|---|---|---|---|---|---|---|
| 0 | Alice | 25 | Female | New York | USA | North |
| 1 | Bob | 32 | Male | Paris | France | South |
| 2 | Charlie | 19 | Male | London | UK | East |
| 3 | David | 47 | Male | San Francisco | USA | West |
| 4 | Emily | 28 | Female | Tokyo | Japan | North |
# use insert if location of new column is required
df.insert(loc=3,column='Programmer', value = 'Python')df| Name | Age | Gender | Programmer | City | Country | Region | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25 | Female | Python | New York | USA | North |
| 1 | Bob | 32 | Male | Python | Paris | France | South |
| 2 | Charlie | 19 | Male | Python | London | UK | East |
| 3 | David | 47 | Male | Python | San Francisco | USA | West |
| 4 | Emily | 28 | Female | Python | Tokyo | Japan | North |
Sorting Dataframe
df.sort_values(by='Age',ascending=False)| Name | Age | Gender | Programmer | City | Country | Region | |
|---|---|---|---|---|---|---|---|
| 3 | David | 47 | Male | Python | San Francisco | USA | West |
| 1 | Bob | 32 | Male | Python | Paris | France | South |
| 4 | Emily | 28 | Female | Python | Tokyo | Japan | North |
| 0 | Alice | 25 | Female | Python | New York | USA | North |
| 2 | Charlie | 19 | Male | Python | London | UK | East |
Filtering Data
df[df['Age']>30]| Name | Age | Gender | Programmer | City | Country | Region | |
|---|---|---|---|---|---|---|---|
| 1 | Bob | 32 | Male | Python | Paris | France | South |
| 3 | David | 47 | Male | Python | San Francisco | USA | West |
# Filtering using multiple conditions
df[(df['Age']>30) & (df['Gender']=='Male')]| Name | Age | Gender | Programmer | City | Country | Region | |
|---|---|---|---|---|---|---|---|
| 1 | Bob | 32 | Male | Python | Paris | France | South |
| 3 | David | 47 | Male | Python | San Francisco | USA | West |
Getting statistics
df.describe()| Age | |
|---|---|
| count | 5.000000 |
| mean | 30.200000 |
| std | 10.521407 |
| min | 19.000000 |
| 25% | 25.000000 |
| 50% | 28.000000 |
| 75% | 32.000000 |
| max | 47.000000 |
df['Age'].mean(),df['Age'].std(),df['Age'].median()(30.2, 10.521406750050108, 28.0)
Fill NA
df.at[1,'Age'] = np.nandf| Name | Age | Gender | Programmer | City | Country | Region | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.0 | Female | Python | New York | USA | North |
| 1 | Bob | NaN | Male | Python | Paris | France | South |
| 2 | Charlie | 19.0 | Male | Python | London | UK | East |
| 3 | David | 47.0 | Male | Python | San Francisco | USA | West |
| 4 | Emily | 28.0 | Female | Python | Tokyo | Japan | North |
# By default nan is not considered
df['Age'].mean()29.75
# When nan is considered, the mean will return nan if NA values are present
df['Age'].mean(skipna=False)nan
df.fillna(value=0)| Name | Age | Gender | Programmer | City | Country | Region | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.0 | Female | Python | New York | USA | North |
| 1 | Bob | 0.0 | Male | Python | Paris | France | South |
| 2 | Charlie | 19.0 | Male | Python | London | UK | East |
| 3 | David | 47.0 | Male | Python | San Francisco | USA | West |
| 4 | Emily | 28.0 | Female | Python | Tokyo | Japan | North |
df.fillna(method='ffill')| Name | Age | Gender | Programmer | City | Country | Region | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.0 | Female | Python | New York | USA | North |
| 1 | Bob | 25.0 | Male | Python | Paris | France | South |
| 2 | Charlie | 19.0 | Male | Python | London | UK | East |
| 3 | David | 47.0 | Male | Python | San Francisco | USA | West |
| 4 | Emily | 28.0 | Female | Python | Tokyo | Japan | North |
df.fillna(method='bfill')| Name | Age | Gender | Programmer | City | Country | Region | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.0 | Female | Python | New York | USA | North |
| 1 | Bob | 19.0 | Male | Python | Paris | France | South |
| 2 | Charlie | 19.0 | Male | Python | London | UK | East |
| 3 | David | 47.0 | Male | Python | San Francisco | USA | West |
| 4 | Emily | 28.0 | Female | Python | Tokyo | Japan | North |
# This method will fill the intermediate values
df['Age'].interpolate()0 25.0
1 22.0
2 19.0
3 47.0
4 28.0
Name: Age, dtype: float64
df.dropna(inplace=True)df| Name | Age | Gender | Programmer | City | Country | Region | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.0 | Female | Python | New York | USA | North |
| 2 | Charlie | 19.0 | Male | Python | London | UK | East |
| 3 | David | 47.0 | Male | Python | San Francisco | USA | West |
| 4 | Emily | 28.0 | Female | Python | Tokyo | Japan | North |
Iterating over a pandas dataframe
We should avoid iterating over rows in a DataframeWe should use vectorized operations wherever feasible- use .index and .columns to iterate over rows and columns
- use items() to iterate over columns
- use iteritems() to iterate over columns #Will be depricated in the future
- use iterrows() to iterate over rows
- use itertuples() to iterate over rows and get named tuples
# items() and iteritems() will return a tuple
# The first item will be the row label
# The second item will be the value of the column
for col_label, col in df.items():
print(col_label,col,sep='\n',end='\n\n')Name
0 Alice
2 Charlie
3 David
4 Emily
Name: Name, dtype: object
Age
0 25.0
2 19.0
3 47.0
4 28.0
Name: Age, dtype: float64
Gender
0 Female
2 Male
3 Male
4 Female
Name: Gender, dtype: object
Programmer
0 Python
2 Python
3 Python
4 Python
Name: Programmer, dtype: object
City
0 New York
2 London
3 San Francisco
4 Tokyo
Name: City, dtype: object
Country
0 USA
2 UK
3 USA
4 Japan
Name: Country, dtype: object
Region
0 North
2 East
3 West
4 North
Name: Region, dtype: object
for row_label, row in df.iterrows():
print(row_label,row,sep='\n',end='\n\n')0
Name Alice
Age 25.0
Gender Female
Programmer Python
City New York
Country USA
Region North
Name: 0, dtype: object
2
Name Charlie
Age 19.0
Gender Male
Programmer Python
City London
Country UK
Region East
Name: 2, dtype: object
3
Name David
Age 47.0
Gender Male
Programmer Python
City San Francisco
Country USA
Region West
Name: 3, dtype: object
4
Name Emily
Age 28.0
Gender Female
Programmer Python
City Tokyo
Country Japan
Region North
Name: 4, dtype: object
## Accessing specific row values
## row will be a series object
## we can use square bracket indexing to get the value we are interested
for _, row in df.iterrows():
print(row['Name'], row['Age'])Alice 25.0
Charlie 19.0
David 47.0
Emily 28.0
for row in df.itertuples():
print(row)Pandas(Index=0, Name='Alice', Age=25.0, Gender='Female', Programmer='Python', City='New York', Country='USA', Region='North')
Pandas(Index=2, Name='Charlie', Age=19.0, Gender='Male', Programmer='Python', City='London', Country='UK', Region='East')
Pandas(Index=3, Name='David', Age=47.0, Gender='Male', Programmer='Python', City='San Francisco', Country='USA', Region='West')
Pandas(Index=4, Name='Emily', Age=28.0, Gender='Female', Programmer='Python', City='Tokyo', Country='Japan', Region='North')
# Accessing the required values in a row
for row in df.itertuples():
print(row.Name, row.Age)Alice 25.0
Charlie 19.0
David 47.0
Emily 28.0