Pandas 101

import pandas as pd
import numpy as np

Using Dictionary to create a dataframe

The values of the dictionary can be a tuple, list, Numpy array, or pandas series object

data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Emily"],
    "Age": [25, 32, 19, 47, 28],
    "Gender": ["Female", "Male", "Male", "Male", "Female"],
    "City": ["New York", "Paris", "London", "San Francisco", "Tokyo"],
    "Country": ["USA", "France", "UK", "USA", "Japan"]
}

row_labels = [0,1,2,3,4]

df = pd.DataFrame(data,index=row_labels)

df

	Name	Age	Gender	City	Country
0	Alice	25	Female	New York	USA
1	Bob	32	Male	Paris	France
2	Charlie	19	Male	London	UK
3	David	47	Male	San Francisco	USA
4	Emily	28	Female	Tokyo	Japan

cities = df['City']

# Accessing the single item of a series
# It is the same way as with a dictionary, by using its label as a key
cities[2]

'London'

# Accessing a single row
df.loc[3]

Name               David
Age                   47
Gender              Male
City       San Francisco
Country              USA
Name: 3, dtype: object

data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Emily"],
    "Age": np.array([25, 32, 19, 47, 28]),
    "Gender": ["Female", "Male", "Male", "Male", "Female"],
    "City": ["New York", "Paris", "London", "San Francisco", "Tokyo"],
    "Country": ["USA", "France", "UK", "USA", "Japan"],
    "Constant": 100
}

pd.DataFrame(data,index=row_labels,columns=['Constant','Name','Age','Gender','City','Country'])

	Constant	Name	Age	Gender	City	Country
0	100	Alice	25	Female	New York	USA
1	100	Bob	32	Male	Paris	France
2	100	Charlie	19	Male	London	UK
3	100	David	47	Male	San Francisco	USA
4	100	Emily	28	Female	Tokyo	Japan

Creating dataframe from list of lists

The same will work with list of tuples, numpy arrays

my_list = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
]

pd.DataFrame(my_list, columns=['A', 'B', 'C'])

	A	B	C
0	1	2	3
1	4	5	6
2	7	8	9

when creating a dataframe from numpy array, we can specify the parameter copy
when copy is False, dataframe is created from a numpy array in place
This is very efficient when creating large dataframes from numpy arrays
When the array changes, the dataframe will be changed

my_list = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])

df_1 = pd.DataFrame(my_list, 
                columns=['A', 'B', 'C'],
                copy=False)

df_1

	A	B	C
0	1	2	3
1	4	5	6
2	7	8	9

my_list[0][0] = 100

df_1

	A	B	C
0	100	2	3
1	4	5	6
2	7	8	9

Retrieving Labels and Data

# Get dataframe row labels
df_1.index

RangeIndex(start=0, stop=3, step=1)

# Getting the columns of a dataframe
df_1.columns

Index(['A', 'B', 'C'], dtype='object')

# Get dataframe values without labels
# copy is only available for the 'to_numpy' method
df_1.to_numpy(copy=True)

array([[100,   2,   3],
       [  4,   5,   6],
       [  7,   8,   9]])

### Cannot use 'copy' with values
df.values

array([['Alice', 25, 'Female', 'New York', 'USA'],
       ['Bob', 32, 'Male', 'Paris', 'France'],
       ['Charlie', 19, 'Male', 'London', 'UK'],
       ['David', 47, 'Male', 'San Francisco', 'USA'],
       ['Emily', 28, 'Female', 'Tokyo', 'Japan']], dtype=object)

df.dtypes

Name       object
Age         int64
Gender     object
City       object
Country    object
dtype: object

df = df.astype(dtype={'Age':np.int32,'Country':'category'})

df.dtypes

Name         object
Age           int32
Gender       object
City         object
Country    category
dtype: object

DF shape and Size

df.ndim

df.shape

(5, 5)

df.size

# Returns memory usage in bytes
df.memory_usage()

Index      212
Name        40
Age         20
Gender      40
City        40
Country    209
dtype: int64

Accessing Data

# Get column values
df['Country']

0       USA
1    France
2        UK
3       USA
4     Japan
Name: Country, dtype: category
Categories (4, object): ['France', 'Japan', 'UK', 'USA']

# Get row values
df.loc[3]

Name               David
Age                   47
Gender              Male
City       San Francisco
Country              USA
Name: 3, dtype: object

df.loc[0:1]

	Name	Age	Gender	City	Country
0	Alice	25	Female	New York	USA
1	Bob	32	Male	Paris	France

# when accessing column using loc, we need to provide row labels as well
df.loc[:,'City']

0         New York
1            Paris
2           London
3    San Francisco
4            Tokyo
Name: City, dtype: object

# we can use iloc when using integer index
df.iloc[0]

Name          Alice
Age              25
Gender       Female
City       New York
Country         USA
Name: 0, dtype: object

# Get first two rows and first three columns
# iloc indexing starts from 0
df.iloc[0:2,0:3]

	Name	Age	Gender
0	Alice	25	Female
1	Bob	32	Male

df.iloc[0:2,[0,3]]

	Name	City
0	Alice	New York
1	Bob	Paris

For iloc the stop index is exclusive.
For loc it is inclusive.
Pandas recommends using specialized accessors .at[] and .iat[] to get single data value

df.at[0,'City']

'New York'

df.iat[2,2]

'Male'

# We can set the value using loc and iloc
df.loc[:,'Age'] = [25,32,19,47,28]

df

	Name	Age	Gender	City	Country
0	Alice	25	Female	New York	USA
1	Bob	32	Male	Paris	France
2	Charlie	19	Male	London	UK
3	David	47	Male	San Francisco	USA
4	Emily	28	Female	Tokyo	Japan

Inserting and Deleting Rows

cheeka = pd.Series(data=['Cheeka', 14,'Male','Pune','India'],
                   index=df.columns,name=5)

df = df.append(cheeka)

/tmp/ipykernel_18888/1505868640.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df = df.append(cheeka)

df

	Name	Age	Gender	City	Country
0	Alice	25	Female	New York	USA
1	Bob	32	Male	Paris	France
2	Charlie	19	Male	London	UK
3	David	47	Male	San Francisco	USA
4	Emily	28	Female	Tokyo	Japan
5	Cheeka	14	Male	Pune	India

df = df.drop(labels=[5])

df

	Name	Age	Gender	City	Country
0	Alice	25	Female	New York	USA
1	Bob	32	Male	Paris	France
2	Charlie	19	Male	London	UK
3	David	47	Male	San Francisco	USA
4	Emily	28	Female	Tokyo	Japan

# Inserting a column
df['Region'] = ['North', 'South', 'East', 'West','North']

df

	Name	Age	Gender	City	Country	Region
0	Alice	25	Female	New York	USA	North
1	Bob	32	Male	Paris	France	South
2	Charlie	19	Male	London	UK	East
3	David	47	Male	San Francisco	USA	West
4	Emily	28	Female	Tokyo	Japan	North

# use insert if location of new column is required
df.insert(loc=3,column='Programmer', value = 'Python')

df

	Name	Age	Gender	Programmer	City	Country	Region
0	Alice	25	Female	Python	New York	USA	North
1	Bob	32	Male	Python	Paris	France	South
2	Charlie	19	Male	Python	London	UK	East
3	David	47	Male	Python	San Francisco	USA	West
4	Emily	28	Female	Python	Tokyo	Japan	North

Sorting Dataframe

df.sort_values(by='Age',ascending=False)

	Name	Age	Gender	Programmer	City	Country	Region
3	David	47	Male	Python	San Francisco	USA	West
1	Bob	32	Male	Python	Paris	France	South
4	Emily	28	Female	Python	Tokyo	Japan	North
0	Alice	25	Female	Python	New York	USA	North
2	Charlie	19	Male	Python	London	UK	East

Filtering Data

df[df['Age']>30]

	Name	Age	Gender	Programmer	City	Country	Region
1	Bob	32	Male	Python	Paris	France	South
3	David	47	Male	Python	San Francisco	USA	West

# Filtering using multiple conditions
df[(df['Age']>30) & (df['Gender']=='Male')]

	Name	Age	Gender	Programmer	City	Country	Region
1	Bob	32	Male	Python	Paris	France	South
3	David	47	Male	Python	San Francisco	USA	West

Getting statistics

df.describe()

	Age
count	5.000000
mean	30.200000
std	10.521407
min	19.000000
25%	25.000000
50%	28.000000
75%	32.000000
max	47.000000

df['Age'].mean(),df['Age'].std(),df['Age'].median()

(30.2, 10.521406750050108, 28.0)

Fill NA

df.at[1,'Age'] = np.nan

df

	Name	Age	Gender	Programmer	City	Country	Region
0	Alice	25.0	Female	Python	New York	USA	North
1	Bob	NaN	Male	Python	Paris	France	South
2	Charlie	19.0	Male	Python	London	UK	East
3	David	47.0	Male	Python	San Francisco	USA	West
4	Emily	28.0	Female	Python	Tokyo	Japan	North

# By default nan is not considered
df['Age'].mean()

29.75

# When nan is considered, the mean will return nan if NA values are present
df['Age'].mean(skipna=False)

nan

df.fillna(value=0)

	Name	Age	Gender	Programmer	City	Country	Region
0	Alice	25.0	Female	Python	New York	USA	North
1	Bob	0.0	Male	Python	Paris	France	South
2	Charlie	19.0	Male	Python	London	UK	East
3	David	47.0	Male	Python	San Francisco	USA	West
4	Emily	28.0	Female	Python	Tokyo	Japan	North

df.fillna(method='ffill')

	Name	Age	Gender	Programmer	City	Country	Region
0	Alice	25.0	Female	Python	New York	USA	North
1	Bob	25.0	Male	Python	Paris	France	South
2	Charlie	19.0	Male	Python	London	UK	East
3	David	47.0	Male	Python	San Francisco	USA	West
4	Emily	28.0	Female	Python	Tokyo	Japan	North

df.fillna(method='bfill')

	Name	Age	Gender	Programmer	City	Country	Region
0	Alice	25.0	Female	Python	New York	USA	North
1	Bob	19.0	Male	Python	Paris	France	South
2	Charlie	19.0	Male	Python	London	UK	East
3	David	47.0	Male	Python	San Francisco	USA	West
4	Emily	28.0	Female	Python	Tokyo	Japan	North

# This method will fill the intermediate values 
df['Age'].interpolate()

0    25.0
1    22.0
2    19.0
3    47.0
4    28.0
Name: Age, dtype: float64

df.dropna(inplace=True)

df

	Name	Age	Gender	Programmer	City	Country	Region
0	Alice	25.0	Female	Python	New York	USA	North
2	Charlie	19.0	Male	Python	London	UK	East
3	David	47.0	Male	Python	San Francisco	USA	West
4	Emily	28.0	Female	Python	Tokyo	Japan	North

Iterating over a pandas dataframe

We should avoid iterating over rows in a Dataframe
We should use vectorized operations wherever feasible
use .index and .columns to iterate over rows and columns
use items() to iterate over columns
use iteritems() to iterate over columns #Will be depricated in the future
use iterrows() to iterate over rows
use itertuples() to iterate over rows and get named tuples

# items() and iteritems() will return a tuple
# The first item will be the row label
# The second item will be the value of the column
for col_label, col in df.items():
    print(col_label,col,sep='\n',end='\n\n')

Name
0      Alice
2    Charlie
3      David
4      Emily
Name: Name, dtype: object

Age
0    25.0
2    19.0
3    47.0
4    28.0
Name: Age, dtype: float64

Gender
0    Female
2      Male
3      Male
4    Female
Name: Gender, dtype: object

Programmer
0    Python
2    Python
3    Python
4    Python
Name: Programmer, dtype: object

City
0         New York
2           London
3    San Francisco
4            Tokyo
Name: City, dtype: object

Country
0      USA
2       UK
3      USA
4    Japan
Name: Country, dtype: object

Region
0    North
2     East
3     West
4    North
Name: Region, dtype: object

for row_label, row in df.iterrows():
    print(row_label,row,sep='\n',end='\n\n')

0
Name             Alice
Age               25.0
Gender          Female
Programmer      Python
City          New York
Country            USA
Region           North
Name: 0, dtype: object

2
Name          Charlie
Age              19.0
Gender           Male
Programmer     Python
City           London
Country            UK
Region           East
Name: 2, dtype: object

3
Name                  David
Age                    47.0
Gender                 Male
Programmer           Python
City          San Francisco
Country                 USA
Region                 West
Name: 3, dtype: object

4
Name           Emily
Age             28.0
Gender        Female
Programmer    Python
City           Tokyo
Country        Japan
Region         North
Name: 4, dtype: object

## Accessing specific row values
## row will be a series object
## we can use square bracket indexing to get the value we are interested
for _, row in df.iterrows():
    print(row['Name'], row['Age'])

Alice 25.0
Charlie 19.0
David 47.0
Emily 28.0

for row in df.itertuples():
    print(row)

Pandas(Index=0, Name='Alice', Age=25.0, Gender='Female', Programmer='Python', City='New York', Country='USA', Region='North')
Pandas(Index=2, Name='Charlie', Age=19.0, Gender='Male', Programmer='Python', City='London', Country='UK', Region='East')
Pandas(Index=3, Name='David', Age=47.0, Gender='Male', Programmer='Python', City='San Francisco', Country='USA', Region='West')
Pandas(Index=4, Name='Emily', Age=28.0, Gender='Female', Programmer='Python', City='Tokyo', Country='Japan', Region='North')

# Accessing the required values in a row
for row in df.itertuples():
    print(row.Name, row.Age)

Alice 25.0
Charlie 19.0
David 47.0
Emily 28.0