Pandas 101

import pandas as pd
import numpy as np

Using Dictionary to create a dataframe

  • The values of the dictionary can be a tuple, list, Numpy array, or pandas series object
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Emily"],
    "Age": [25, 32, 19, 47, 28],
    "Gender": ["Female", "Male", "Male", "Male", "Female"],
    "City": ["New York", "Paris", "London", "San Francisco", "Tokyo"],
    "Country": ["USA", "France", "UK", "USA", "Japan"]
}

row_labels = [0,1,2,3,4]
df = pd.DataFrame(data,index=row_labels)
df
Name Age Gender City Country
0 Alice 25 Female New York USA
1 Bob 32 Male Paris France
2 Charlie 19 Male London UK
3 David 47 Male San Francisco USA
4 Emily 28 Female Tokyo Japan
cities = df['City']
# Accessing the single item of a series
# It is the same way as with a dictionary, by using its label as a key
cities[2]
'London'
# Accessing a single row
df.loc[3]
Name               David
Age                   47
Gender              Male
City       San Francisco
Country              USA
Name: 3, dtype: object
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Emily"],
    "Age": np.array([25, 32, 19, 47, 28]),
    "Gender": ["Female", "Male", "Male", "Male", "Female"],
    "City": ["New York", "Paris", "London", "San Francisco", "Tokyo"],
    "Country": ["USA", "France", "UK", "USA", "Japan"],
    "Constant": 100
}
pd.DataFrame(data,index=row_labels,columns=['Constant','Name','Age','Gender','City','Country'])
Constant Name Age Gender City Country
0 100 Alice 25 Female New York USA
1 100 Bob 32 Male Paris France
2 100 Charlie 19 Male London UK
3 100 David 47 Male San Francisco USA
4 100 Emily 28 Female Tokyo Japan

Creating dataframe from list of lists

  • The same will work with list of tuples, numpy arrays
my_list = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
]
pd.DataFrame(my_list, columns=['A', 'B', 'C'])
A B C
0 1 2 3
1 4 5 6
2 7 8 9
  • when creating a dataframe from numpy array, we can specify the parameter copy
  • when copy is False, dataframe is created from a numpy array in place
  • This is very efficient when creating large dataframes from numpy arrays
  • When the array changes, the dataframe will be changed
my_list = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])
df_1 = pd.DataFrame(my_list, 
                columns=['A', 'B', 'C'],
                copy=False)
df_1
A B C
0 1 2 3
1 4 5 6
2 7 8 9
my_list[0][0] = 100
df_1
A B C
0 100 2 3
1 4 5 6
2 7 8 9

Retrieving Labels and Data

# Get dataframe row labels
df_1.index
RangeIndex(start=0, stop=3, step=1)
# Getting the columns of a dataframe
df_1.columns
Index(['A', 'B', 'C'], dtype='object')
# Get dataframe values without labels
# copy is only available for the 'to_numpy' method
df_1.to_numpy(copy=True)
array([[100,   2,   3],
       [  4,   5,   6],
       [  7,   8,   9]])
### Cannot use 'copy' with values
df.values
array([['Alice', 25, 'Female', 'New York', 'USA'],
       ['Bob', 32, 'Male', 'Paris', 'France'],
       ['Charlie', 19, 'Male', 'London', 'UK'],
       ['David', 47, 'Male', 'San Francisco', 'USA'],
       ['Emily', 28, 'Female', 'Tokyo', 'Japan']], dtype=object)
df.dtypes
Name       object
Age         int64
Gender     object
City       object
Country    object
dtype: object
df = df.astype(dtype={'Age':np.int32,'Country':'category'})
df.dtypes
Name         object
Age           int32
Gender       object
City         object
Country    category
dtype: object

DF shape and Size

df.ndim
2
df.shape
(5, 5)
df.size
25
# Returns memory usage in bytes
df.memory_usage()
Index      212
Name        40
Age         20
Gender      40
City        40
Country    209
dtype: int64

Accessing Data

# Get column values
df['Country']
0       USA
1    France
2        UK
3       USA
4     Japan
Name: Country, dtype: category
Categories (4, object): ['France', 'Japan', 'UK', 'USA']
# Get row values
df.loc[3]
Name               David
Age                   47
Gender              Male
City       San Francisco
Country              USA
Name: 3, dtype: object
df.loc[0:1]
Name Age Gender City Country
0 Alice 25 Female New York USA
1 Bob 32 Male Paris France
# when accessing column using loc, we need to provide row labels as well
df.loc[:,'City']
0         New York
1            Paris
2           London
3    San Francisco
4            Tokyo
Name: City, dtype: object
# we can use iloc when using integer index
df.iloc[0]
Name          Alice
Age              25
Gender       Female
City       New York
Country         USA
Name: 0, dtype: object
# Get first two rows and first three columns
# iloc indexing starts from 0
df.iloc[0:2,0:3]
Name Age Gender
0 Alice 25 Female
1 Bob 32 Male
df.iloc[0:2,[0,3]]
Name City
0 Alice New York
1 Bob Paris
  • For iloc the stop index is exclusive.

  • For loc it is inclusive.

  • Pandas recommends using specialized accessors .at[] and .iat[] to get single data value

df.at[0,'City']
'New York'
df.iat[2,2]
'Male'
# We can set the value using loc and iloc
df.loc[:,'Age'] = [25,32,19,47,28]
df
Name Age Gender City Country
0 Alice 25 Female New York USA
1 Bob 32 Male Paris France
2 Charlie 19 Male London UK
3 David 47 Male San Francisco USA
4 Emily 28 Female Tokyo Japan

Inserting and Deleting Rows

cheeka = pd.Series(data=['Cheeka', 14,'Male','Pune','India'],
                   index=df.columns,name=5)
df = df.append(cheeka)
/tmp/ipykernel_18888/1505868640.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df = df.append(cheeka)
df
Name Age Gender City Country
0 Alice 25 Female New York USA
1 Bob 32 Male Paris France
2 Charlie 19 Male London UK
3 David 47 Male San Francisco USA
4 Emily 28 Female Tokyo Japan
5 Cheeka 14 Male Pune India
df = df.drop(labels=[5])
df
Name Age Gender City Country
0 Alice 25 Female New York USA
1 Bob 32 Male Paris France
2 Charlie 19 Male London UK
3 David 47 Male San Francisco USA
4 Emily 28 Female Tokyo Japan
# Inserting a column
df['Region'] = ['North', 'South', 'East', 'West','North']
df
Name Age Gender City Country Region
0 Alice 25 Female New York USA North
1 Bob 32 Male Paris France South
2 Charlie 19 Male London UK East
3 David 47 Male San Francisco USA West
4 Emily 28 Female Tokyo Japan North
# use insert if location of new column is required
df.insert(loc=3,column='Programmer', value = 'Python')
df
Name Age Gender Programmer City Country Region
0 Alice 25 Female Python New York USA North
1 Bob 32 Male Python Paris France South
2 Charlie 19 Male Python London UK East
3 David 47 Male Python San Francisco USA West
4 Emily 28 Female Python Tokyo Japan North

Sorting Dataframe

df.sort_values(by='Age',ascending=False)
Name Age Gender Programmer City Country Region
3 David 47 Male Python San Francisco USA West
1 Bob 32 Male Python Paris France South
4 Emily 28 Female Python Tokyo Japan North
0 Alice 25 Female Python New York USA North
2 Charlie 19 Male Python London UK East

Filtering Data

df[df['Age']>30]
Name Age Gender Programmer City Country Region
1 Bob 32 Male Python Paris France South
3 David 47 Male Python San Francisco USA West
# Filtering using multiple conditions
df[(df['Age']>30) & (df['Gender']=='Male')]
Name Age Gender Programmer City Country Region
1 Bob 32 Male Python Paris France South
3 David 47 Male Python San Francisco USA West

Getting statistics

df.describe()
Age
count 5.000000
mean 30.200000
std 10.521407
min 19.000000
25% 25.000000
50% 28.000000
75% 32.000000
max 47.000000
df['Age'].mean(),df['Age'].std(),df['Age'].median()
(30.2, 10.521406750050108, 28.0)

Fill NA

df.at[1,'Age'] = np.nan
df
Name Age Gender Programmer City Country Region
0 Alice 25.0 Female Python New York USA North
1 Bob NaN Male Python Paris France South
2 Charlie 19.0 Male Python London UK East
3 David 47.0 Male Python San Francisco USA West
4 Emily 28.0 Female Python Tokyo Japan North
# By default nan is not considered
df['Age'].mean()
29.75
# When nan is considered, the mean will return nan if NA values are present
df['Age'].mean(skipna=False)
nan
df.fillna(value=0)
Name Age Gender Programmer City Country Region
0 Alice 25.0 Female Python New York USA North
1 Bob 0.0 Male Python Paris France South
2 Charlie 19.0 Male Python London UK East
3 David 47.0 Male Python San Francisco USA West
4 Emily 28.0 Female Python Tokyo Japan North
df.fillna(method='ffill')
Name Age Gender Programmer City Country Region
0 Alice 25.0 Female Python New York USA North
1 Bob 25.0 Male Python Paris France South
2 Charlie 19.0 Male Python London UK East
3 David 47.0 Male Python San Francisco USA West
4 Emily 28.0 Female Python Tokyo Japan North
df.fillna(method='bfill')
Name Age Gender Programmer City Country Region
0 Alice 25.0 Female Python New York USA North
1 Bob 19.0 Male Python Paris France South
2 Charlie 19.0 Male Python London UK East
3 David 47.0 Male Python San Francisco USA West
4 Emily 28.0 Female Python Tokyo Japan North
# This method will fill the intermediate values 
df['Age'].interpolate()
0    25.0
1    22.0
2    19.0
3    47.0
4    28.0
Name: Age, dtype: float64
df.dropna(inplace=True)
df
Name Age Gender Programmer City Country Region
0 Alice 25.0 Female Python New York USA North
2 Charlie 19.0 Male Python London UK East
3 David 47.0 Male Python San Francisco USA West
4 Emily 28.0 Female Python Tokyo Japan North

Iterating over a pandas dataframe

  • We should avoid iterating over rows in a Dataframe
  • We should use vectorized operations wherever feasible
  • use .index and .columns to iterate over rows and columns
  • use items() to iterate over columns
  • use iteritems() to iterate over columns #Will be depricated in the future
  • use iterrows() to iterate over rows
  • use itertuples() to iterate over rows and get named tuples
# items() and iteritems() will return a tuple
# The first item will be the row label
# The second item will be the value of the column
for col_label, col in df.items():
    print(col_label,col,sep='\n',end='\n\n')
Name
0      Alice
2    Charlie
3      David
4      Emily
Name: Name, dtype: object

Age
0    25.0
2    19.0
3    47.0
4    28.0
Name: Age, dtype: float64

Gender
0    Female
2      Male
3      Male
4    Female
Name: Gender, dtype: object

Programmer
0    Python
2    Python
3    Python
4    Python
Name: Programmer, dtype: object

City
0         New York
2           London
3    San Francisco
4            Tokyo
Name: City, dtype: object

Country
0      USA
2       UK
3      USA
4    Japan
Name: Country, dtype: object

Region
0    North
2     East
3     West
4    North
Name: Region, dtype: object
for row_label, row in df.iterrows():
    print(row_label,row,sep='\n',end='\n\n')
0
Name             Alice
Age               25.0
Gender          Female
Programmer      Python
City          New York
Country            USA
Region           North
Name: 0, dtype: object

2
Name          Charlie
Age              19.0
Gender           Male
Programmer     Python
City           London
Country            UK
Region           East
Name: 2, dtype: object

3
Name                  David
Age                    47.0
Gender                 Male
Programmer           Python
City          San Francisco
Country                 USA
Region                 West
Name: 3, dtype: object

4
Name           Emily
Age             28.0
Gender        Female
Programmer    Python
City           Tokyo
Country        Japan
Region         North
Name: 4, dtype: object
## Accessing specific row values
## row will be a series object
## we can use square bracket indexing to get the value we are interested
for _, row in df.iterrows():
    print(row['Name'], row['Age'])
Alice 25.0
Charlie 19.0
David 47.0
Emily 28.0
for row in df.itertuples():
    print(row)
Pandas(Index=0, Name='Alice', Age=25.0, Gender='Female', Programmer='Python', City='New York', Country='USA', Region='North')
Pandas(Index=2, Name='Charlie', Age=19.0, Gender='Male', Programmer='Python', City='London', Country='UK', Region='East')
Pandas(Index=3, Name='David', Age=47.0, Gender='Male', Programmer='Python', City='San Francisco', Country='USA', Region='West')
Pandas(Index=4, Name='Emily', Age=28.0, Gender='Female', Programmer='Python', City='Tokyo', Country='Japan', Region='North')
# Accessing the required values in a row
for row in df.itertuples():
    print(row.Name, row.Age)
Alice 25.0
Charlie 19.0
David 47.0
Emily 28.0