-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpandas_usage.py
More file actions
142 lines (118 loc) · 4.55 KB
/
Copy pathpandas_usage.py
File metadata and controls
142 lines (118 loc) · 4.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/python
# Date: 2018-10-27
#
# Description:
# Basic operations in pandas library
import pandas as pd
# Create data frame, reading data from CSV file
df = pd.read_csv('weather_data.csv')
print df
# Same can be done using data in dictionary also
weather_data = {
'Day': ['1/1/2018', '1/2/2018', '1/3/2018', '1/4/2018', '1/5/2018', '1/6/2018'],
'Temperature': [32, 35, 28, 24, 32, 31],
'Windspeed': [6, 7, 2, 7, 4, 2],
'Event': ['Rain', 'Sunny', 'Snow', 'Snow', 'Rain', 'Sunny'],
}
df = pd.DataFrame(weather_data)
print df
# Day Event Temperature Windspeed
# 0 1/1/2018 Rain 32 6
# 1 1/2/2018 Sunny 35 7
# 2 1/3/2018 Snow 28 2
# 3 1/4/2018 Snow 24 7
# 4 1/5/2018 Rain 32 4
# 5 1/6/2018 Sunny 31 2
print df.shape # (6, 4)
print df.head(2) # Print first 2 rows only with column names and row index
print df.tail(2) # Print last 2 rows only with column names and row index
print df.head(-2) # Prints 0 to n - 2 rows
print df[1:3] # Same as index slicing, prints rows from 0 to 2
print df.columns # Index([u'Day', u'Event', u'Temperature', u'Windspeed'], dtype='object')
print df.Day # Print all values in Day column
# 0 1/1/2018
# 1 1/2/2018
# 2 1/3/2018
# 3 1/4/2018
# 4 1/5/2018
# 5 1/6/2018
# Name: Day, dtype: object
print df['Temperature'] # Same as above, print all values in Temperature col
# 0 32
# 1 35
# 2 28
# 3 24
# 4 32
# 5 31
# Name: Temperature, dtype: int64
print type(df['Temperature']) # <class 'pandas.core.series.Series'>
# All column in pandas data frame are of this type only
print df[['Day', 'Event']] # Print specific column values
# Day Event
# 0 1/1/2018 Rain
# 1 1/2/2018 Sunny
# 2 1/3/2018 Snow
# 3 1/4/2018 Snow
# 4 1/5/2018 Rain
# 5 1/6/2018 Sunny
## Operations on data frames
print df['Temperature'].max() # 35, Max temperature
print df['Temperature'].min() # 24, Min temperature
print df['Temperature'].sum() # 182, Sum of all temperatures
print df['Temperature'].mean() # 30.333333333333332, Average temperature
print df['Temperature'].std() # 3.8297084310253524, Standard deviation
# Gives statistics of columns having interger values like Temperature and
# Windspeed, Quick way to get table stats
print df.describe()
# Temperature Windspeed
# count 6.000000 6.000000
# mean 30.333333 4.666667
# std 3.829708 2.338090
# min 24.000000 2.000000
# 25% 28.750000 2.500000
# 50% 31.500000 5.000000
# 75% 32.000000 6.750000
# max 35.000000 7.000000
print df[df.Temperature >= 32] # Conditional selection like in relational databases
# Day Event Temperature Windspeed
# 0 1/1/2018 Rain 32 6
# 1 1/2/2018 Sunny 35 7
# 4 1/5/2018 Rain 32 4
print df[df.Temperature == df.Temperature.max()] # Day stats with max temp
# Day Event Temperature Windspeed
# 1 1/2/2018 Sunny 35 7
## Note: If column name contains spaces, then we will have dictionary syntax
# to specify column name like df['Temperature readings'], we can't use dot
# syntax in those cases.
# Day and temp of day which had max temp
print df[['Day', 'Temperature']][df.Temperature == df.Temperature.max()]
# Day Temperature
# 1 1/2/2018 35
## indexes, like range last is not included
print df.index # RangeIndex(start=0, stop=6, step=1)
# Use some column as index instead of counting from 0. set_index() retruns a
# new data frame, to update existing data frame using inplace=True
df.set_index('Day', inplace=True)
print df # Now Day is used as indexes instead of 0,1,2...
# Event Temperature Windspeed
# Day
# 1/1/2018 Rain 32 6
# 1/2/2018 Sunny 35 7
# 1/3/2018 Snow 28 2
# 1/4/2018 Snow 24 7
# 1/5/2018 Rain 32 4
# 1/6/2018 Sunny 31 2
print df.loc['1/2/2018'] # Get row whose index value is '1/2/2018'
# Event Sunny
# Temperature 35
# Windspeed 7
# Name: 1/2/2018, dtype: object
df.reset_index(inplace=True) # Reset index back to numbers
print df
# Day Event Temperature Windspeed
# 0 1/1/2018 Rain 32 6
# 1 1/2/2018 Sunny 35 7
# 2 1/3/2018 Snow 28 2
# 3 1/4/2018 Snow 24 7
# 4 1/5/2018 Rain 32 4
# 5 1/6/2018 Sunny 31 2