import pandas as pd
import numpy as np
import timeit
population_dict = {
'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135
}
population = pd.Series(population_dict)
print(population)
print("Pupulation of California:",population['California'])
print(population['California':'New York'])
print(pd.Series([2, 4, 6]))
print(pd.Series(5, index=[100, 200, 300]))
print(pd.Series([5,6,8], index=[100, 200, 300]))
print(pd.Series({2:'a', 1:'b', 3:'c'}))
print(pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2]))
area_dict = {
'Texas': 695662,
'New York': 141297,
'Florida': 170312,
'Illinois': 149995,
'California': 423967
}
area = pd.Series(area_dict)
print(area)
states = pd.DataFrame({'Population': population,'Area': area})
print(states)
print("Index of states DataFrame:",list(states.index))
print("Columns of states Datframe:",list(states.columns))
print(states['Area'])
print(pd.DataFrame(population, columns=['Population']))
data = [{'a': i, 'b': 2 * i} for i in range(1,4)]
print(pd.DataFrame(data))
print(pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}]))
print(pd.DataFrame(np.random.rand(3, 2),
columns=['Column-1', 'Column-2'],
index=['a', 'b','c']))
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
print(A)
print(pd.DataFrame(A))
print(states.head()) #by default head function get top 5 records from Dataframe
print(states.head(2))# to get specific number of records from top
print(states.tail()) #by default tail function get bottom 5 records from Dataframe
print(states.tail(2))# to get specific number of records from bottom
ind = pd.Index([2, 3, 5, 7, 11])
print(ind)
print(ind[1])
print(ind[::2])
print(ind.size, ind.shape, ind.ndim, ind.dtype)
# print(ind[1]= 0) TypeError: Index does not support mutable operations
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
print(indA.intersection(indB))
print(indA.union(indB))
print(indA.symmetric_difference(indB))
data = pd.Series([0.25, 0.5, 0.75, 1.0],index=['a', 'b', 'c', 'd'])
print(data)
print(data['b'])
#Check explicit index
print('a' in data)
#get keys of Series Object
print(data.keys())
print(data.values)
print(data.items())
print(list(data.items()))
data['e'] = 1.25
print(data)
# slicing by explicit index(user defined index)
print(data['a':'c'])
# slicing by implicit integer index
print(data[0:2])
# masking
print(data[(data > 0.3) & (data < 0.8)])
# fancy indexing
print(data[['a', 'e']])
#Indexers: loc, iloc, and ix
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
print(data)
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
# explicit index when indexing
print(data[1]) #
# implicit index when slicing
print(data[1:3]) #start is inclusive range but end is exclusive range
print(data.loc[1]) #explicit
print(data.loc[1:3]) #explicit inclusive start and end
#The iloc attribute allows indexing and slicing
# that always references the implicit
#Python-style index:
print(data.iloc[1])
print(data.iloc[1:3])
area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
print(data)
print(data['area']) #prefer way to get column
print(data.area)
print(data.area is data['area'])
print(data.pop is data['pop'])
print(type(data.pop),type(data['pop']))
data['density'] = data['pop'] / data['area']
print(data)
print(data.values)
print(data.T) #we can transpose the full DataFrame to swap rows and columns
print(data.values[0])
print(data['area'])
print(data.iloc[:3, :2])
print(data.loc[:'Florida', :'pop'])
print(data.loc[data.density > 100, ['pop', 'density']])
print(data)
data.iloc[0, 2] = 90
print(data)
print(data['Florida':'Illinois']) #explicit index
print(data[1:3]) # implicit index
print(data[data.density > 100])
#Ufuncs: Index Preservation
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
print(ser)
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),
columns=['A', 'B', 'C', 'D'])
print(df)
print(np.exp(ser))
print(np.sin(df * np.pi / 4))
#UFuncs: Index Alignment
area = pd.Series({'Alaska': 1723337, 'Texas': 695662, 'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193, 'New York': 19651127}, name='population')
print(population / area)
print(area.index.union(population.index))
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
print(A + B)
print(A.add(B, fill_value=0))
#Index alignment in DataFrame
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
columns=list('AB'))
print(A)
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
columns=list('BAC'))
print(B)
print(A+B)
mean = A.stack().mean() #4.5
print(A.add(B, fill_value=mean))
"""
Python operator Pandas method(s)
+ add()
- sub(), subtract()
* mul(), multiply()
/ truediv(), div(), divide()
// floordiv()
% mod()
** pow()
"""
#Ufuncs: Operations Between DataFrame and Series
A = rng.randint(10, size=(3, 4))
print(A)
print(A - A[0])
df = pd.DataFrame(A, columns=list('QRST'))
print(df)
print(df - df.iloc[0])
print(df.subtract(df['R'], axis=0))
halfrow = df.iloc[0, ::2]
print(halfrow)
print(df - halfrow)
#handling missing Data
#Missing Data in Pandas
vals1 = np.array([1, None, 3, 4])
print(vals1)
print(vals1.dtype)
for dtype in ['object', 'int']:
print("dtype =", dtype)
# %timeit is an IPython magic command; in standard Python, use timeit module
stmt = f"np.arange(1_000_000, dtype='{dtype}').sum()"
setup = "import numpy as np"
t = timeit.timeit(stmt=stmt, setup=setup, number=10)
print(f"Time for 10 runs: {t:.5f} seconds")
print()
#print(vals1.sum()) TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'
vals2 = np.array([1, np.nan, 3, 4])
print(vals2.dtype) #float64
print(vals2.sum()) #nan
print(1 + np.nan) #nan
print(0 * np.nan) #nan
print(vals2.sum(), vals2.min(), vals2.max()) #nan nan nan
print(np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)) #8.0 1.0 4.0
#NaN and None in Pandas
s1=pd.Series([1, np.nan, 2, None])
print(s1)
x = pd.Series(range(2), dtype=int)
print(x)
x[0] = None
print(x)
"""Pandas handling of NAs by type
Typeclass Conversion when storing NAs NA sentinel value
floating No change np.nan
object No change None or np.nan
integer Cast to float64 np.nan
boolean Cast to object None or np.nan
"""
#Operating on Null Values
'''
isnull(): Generate a Boolean mask indicating missing values
notnull(): Opposite of isnull()
dropna(): Return a filtered version of the data
fillna(): Return a copy of the data with missing values filled or imputed
'''
#Detecting null values
data = pd.Series([1, np.nan, 'hello', None])
print(data.isnull()) #getting masking value for missing values
print(data[data.notnull()]) #getting non null values
#Dropping null values
data2=data.dropna()
print(data2)
#dropna with DataFrame
df = pd.DataFrame([[1, np.nan, 2],
[2, 3, 5],
[np.nan, 4, 6]])
print(df)
#By default, dropna() will drop all rows in which any null value is present:
print("-----------")
print(df.dropna())
#Alternatively, you can drop NA values along a different axis; axis=1 drops all columns containing a null value:
print(df.dropna(axis='columns')) # or df.dropna(axis=1)
df[3] = np.nan
print(df)
print(df.dropna(axis='columns', how='all'))
print(df.dropna(axis='rows', thresh=3))
#the thresh parameter lets you specify a minimum number of non-null values for the row/column to be kept
#Filling null values
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
print(data)
#We can fill NA entries with a single value, such as zero:
print(data.fillna(0))
#We can specify a forward-fill to propagate the previous value forward:
print(data.ffill())
#Or we can specify a back-fill to propagate the next values backward:
print(data.bfill())
#For DataFrames, the options are similar, but we can also specify an axis along which the fills take place:
print(df)
print(df.ffill(axis=1))
print(df.ffill(axis=0))
#Hierarchical Indexing
#A Multiply Indexed Series
#The bad way
index = [('California', 2000), ('California', 2010),
('New York', 2000), ('New York', 2010),
('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
18976457, 19378102,
20851820, 25145561]
pop = pd.Series(populations, index=index)
print(pop)
print(pop[('California', 2010):('Texas', 2000)])
print(pop[[i for i in pop.index if i[1] == 2010]])
#The better way: Pandas MultiIndex
index = pd.MultiIndex.from_tuples(index)
print(index)
pop = pop.reindex(index)
print(pop)
print(pop[:, 2010])
pop_df=pop.unstack()
print(pop_df)
print(pop_df.stack())
pop_df = pd.DataFrame({'total': pop,
'under18': [9267089, 9284094,
4687374, 4318033,
5906301, 6879014]})
print(pop_df)
f_u18 = pop_df['under18'] / pop_df['total']
print(f_u18.unstack())
#Methods of MultiIndex Creation