Pandas Cheat Sheet


import pandas as pd
import numpy as np
import timeit

population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135
    }
population = pd.Series(population_dict)
print(population)
print("Pupulation of California:",population['California'])
print(population['California':'New York'])

print(pd.Series([2, 4, 6]))

print(pd.Series(5, index=[100, 200, 300]))

print(pd.Series([5,6,8], index=[100, 200, 300]))

print(pd.Series({2:'a', 1:'b', 3:'c'}))

print(pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2]))

area_dict = {
     
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
    'California': 423967
    }

area = pd.Series(area_dict)
print(area)

states = pd.DataFrame({'Population': population,'Area': area})

print(states)

print("Index of states DataFrame:",list(states.index))

print("Columns of states Datframe:",list(states.columns))

print(states['Area'])

print(pd.DataFrame(population, columns=['Population']))

data = [{'a': i, 'b': 2 * i} for i in range(1,4)]
print(pd.DataFrame(data))

print(pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}]))



print(pd.DataFrame(np.random.rand(3, 2), columns=['Column-1', 'Column-2'], index=['a', 'b','c']))

A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
print(A)

print(pd.DataFrame(A))

print(states.head()) #by default head function get top 5 records from Dataframe
print(states.head(2))# to get specific number of records from top
print(states.tail()) #by default tail function get bottom 5 records from Dataframe
print(states.tail(2))# to get specific number of records from bottom

ind = pd.Index([2, 3, 5, 7, 11])
print(ind)

print(ind[1])

print(ind[::2])

print(ind.size, ind.shape, ind.ndim, ind.dtype)
# print(ind[1]= 0) TypeError: Index does not support mutable operations

indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

print(indA.intersection(indB))
print(indA.union(indB))
print(indA.symmetric_difference(indB))

data = pd.Series([0.25, 0.5, 0.75, 1.0],index=['a', 'b', 'c', 'd'])
print(data)
print(data['b'])

#Check explicit index
print('a' in data)
#get keys of Series Object
print(data.keys())
print(data.values)
print(data.items())
print(list(data.items()))
data['e'] = 1.25
print(data)

# slicing by explicit index(user defined index)
print(data['a':'c'])

# slicing by implicit integer index
print(data[0:2])

# masking
print(data[(data > 0.3) & (data < 0.8)])
# fancy indexing
print(data[['a', 'e']])

#Indexers: loc, iloc, and ix
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
print(data)

data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
# explicit index when indexing
print(data[1]) #
# implicit index when slicing
print(data[1:3]) #start is inclusive range but end is exclusive range

print(data.loc[1]) #explicit
print(data.loc[1:3]) #explicit inclusive start and end

#The iloc attribute allows indexing and slicing
# that always references the implicit
#Python-style index:

print(data.iloc[1])

print(data.iloc[1:3])


area = pd.Series({'California': 423967, 'Texas': 695662,
 'New York': 141297, 'Florida': 170312,
 'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
 'New York': 19651127, 'Florida': 19552860,
 'Illinois': 12882135})

data = pd.DataFrame({'area':area, 'pop':pop})

print(data)

print(data['area']) #prefer way to get column
print(data.area)

print(data.area is data['area'])
print(data.pop is data['pop'])


print(type(data.pop),type(data['pop']))

data['density'] = data['pop'] / data['area']
print(data)

print(data.values)


print(data.T) #we can transpose the full DataFrame to swap rows and columns

print(data.values[0])

print(data['area'])

print(data.iloc[:3, :2])

print(data.loc[:'Florida', :'pop'])

print(data.loc[data.density > 100, ['pop', 'density']])

print(data)

data.iloc[0, 2] = 90

print(data)


print(data['Florida':'Illinois']) #explicit index

print(data[1:3]) # implicit index

print(data[data.density > 100])

#Ufuncs: Index Preservation

rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))

print(ser)

df = pd.DataFrame(rng.randint(0, 10, (3, 4)),
                 columns=['A', 'B', 'C', 'D'])

print(df)

print(np.exp(ser))

print(np.sin(df * np.pi / 4))

#UFuncs: Index Alignment

area = pd.Series({'Alaska': 1723337, 'Texas': 695662, 'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193, 'New York': 19651127}, name='population')

print(population / area)

print(area.index.union(population.index))

A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
print(A + B)


print(A.add(B, fill_value=0))

#Index alignment in DataFrame
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
 columns=list('AB'))

print(A)

B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
 columns=list('BAC'))
print(B)

print(A+B)

mean = A.stack().mean() #4.5

print(A.add(B, fill_value=mean))

"""
Python operator Pandas method(s)
+               add()
-               sub(), subtract()
*               mul(), multiply()
/               truediv(), div(), divide()
//              floordiv()
%               mod()
**              pow()
"""

#Ufuncs: Operations Between DataFrame and Series

A = rng.randint(10, size=(3, 4))
print(A)

print(A - A[0])

df = pd.DataFrame(A, columns=list('QRST'))
print(df)
print(df - df.iloc[0])

print(df.subtract(df['R'], axis=0))

halfrow = df.iloc[0, ::2]
print(halfrow)

print(df - halfrow)

#handling missing Data
#Missing Data in Pandas
vals1 = np.array([1, None, 3, 4])
print(vals1)
print(vals1.dtype)

for dtype in ['object', 'int']:
 print("dtype =", dtype)
# %timeit is an IPython magic command; in standard Python, use timeit module
stmt = f"np.arange(1_000_000, dtype='{dtype}').sum()"
 setup = "import numpy as np"
 t = timeit.timeit(stmt=stmt, setup=setup, number=10)
 print(f"Time for 10 runs: {t:.5f} seconds")
 print()

#print(vals1.sum()) TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

vals2 = np.array([1, np.nan, 3, 4])
print(vals2.dtype) #float64
print(vals2.sum()) #nan

print(1 + np.nan) #nan
print(0 * np.nan) #nan

print(vals2.sum(), vals2.min(), vals2.max()) #nan nan nan

print(np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)) #8.0 1.0 4.0

#NaN and None in Pandas

s1=pd.Series([1, np.nan, 2, None])
print(s1)

x = pd.Series(range(2), dtype=int)
print(x)

x[0] = None

print(x)
"""Pandas handling of NAs by type

Typeclass Conversion when storing NAs    NA sentinel value
floating   No change                     np.nan
object     No change                     None or np.nan
integer    Cast to float64               np.nan
boolean    Cast to object                None or np.nan
"""
#Operating on Null Values

'''
isnull(): Generate a Boolean mask indicating missing values
notnull(): Opposite of isnull()
dropna(): Return a filtered version of the data
fillna(): Return a copy of the data with missing values filled or imputed
'''

#Detecting null values
data = pd.Series([1, np.nan, 'hello', None])
print(data.isnull()) #getting masking value for missing values
print(data[data.notnull()]) #getting non null values

#Dropping null values
data2=data.dropna()
print(data2)

#dropna with DataFrame
df = pd.DataFrame([[1, np.nan, 2],
 [2, 3, 5],
 [np.nan, 4, 6]])

print(df)

#By default, dropna() will drop all rows in which any null value is present:
print("-----------")
print(df.dropna())

#Alternatively, you can drop NA values along a different axis; axis=1 drops all columns containing a null value:
print(df.dropna(axis='columns')) # or df.dropna(axis=1)

df[3] = np.nan
print(df)


print(df.dropna(axis='columns', how='all'))

print(df.dropna(axis='rows', thresh=3))
#the thresh parameter lets you specify a minimum number of non-null values for the row/column to be kept

#Filling null values
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
print(data)

#We can fill NA entries with a single value, such as zero:
print(data.fillna(0))

#We can specify a forward-fill to propagate the previous value forward:
print(data.ffill())

#Or we can specify a back-fill to propagate the next values backward:
print(data.bfill())

#For DataFrames, the options are similar, but we can also specify an axis along which the fills take place:
print(df)
print(df.ffill(axis=1))
print(df.ffill(axis=0))



#Hierarchical Indexing

#A Multiply Indexed Series

#The bad way
index = [('California', 2000), ('California', 2010),
 ('New York', 2000), ('New York', 2010),
 ('Texas', 2000), ('Texas', 2010)]

populations = [33871648, 37253956,
 18976457, 19378102,
 20851820, 25145561]
pop = pd.Series(populations, index=index)
print(pop)

print(pop[('California', 2010):('Texas', 2000)])

print(pop[[i for i in pop.index if i[1] == 2010]])

#The better way: Pandas MultiIndex
index = pd.MultiIndex.from_tuples(index)
print(index)

pop = pop.reindex(index)
print(pop)


print(pop[:, 2010])


pop_df=pop.unstack()
print(pop_df)

print(pop_df.stack())

pop_df = pd.DataFrame({'total': pop,
 'under18': [9267089, 9284094,
 4687374, 4318033,
 5906301, 6879014]})

print(pop_df)

f_u18 = pop_df['under18'] / pop_df['total']
print(f_u18.unstack())

#Methods of MultiIndex Creation






Post a Comment

Previous Post Next Post