############### Lesson 1: Read from CSV - Export to CSV - Find the max value - Plot the data ###############

from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd
import sys
import matplotlib
import numpy as np

print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)
print('Matplotlib version ' + matplotlib.__version__)

# Create Data
# The data set will consist of 5 baby names and the number of births recorded
# for that year (1880).

# The initial set of baby names and birth counts
names = ['Bob','Jessica','Mary','John','Mel']
births = [968, 155, 77, 578, 973]

# To merge these two lists together we will use the zip function.
BabyDataSet = list(zip(names,births))
BabyDataSet

df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
df

df.to_csv('births1880.csv', index=False, header=False)

Location = r'E:\PythonProject\births1880.csv'
df = pd.read_csv(Location)
df

# Since the file was written without a header, the first record was treated as
# column names; re-read it and supply the names explicitly.
df = pd.read_csv(Location, names=['Names','Births'])
df

import os
os.remove(Location)

df.dtypes
df.Births.dtype

# Method 1: sort and take the top record
Sorted = df.sort_values(['Births'], ascending=False)
Sorted.head(1)

# Method 2: take the max() of the column
df['Births'].max()

# Explanation of the pieces:
# df['Names']        - the entire Names column (the full list of baby names)
# df['Births']       - the entire Births column (the full list of 1880 births)
# df['Births'].max() - the largest value found in the Births column
#
# [df['Births'] == df['Births'].max()] IS EQUAL TO
#     [find all records in the Births column equal to 973]
# df['Names'][df['Births'] == df['Births'].max()] IS EQUAL TO
#     [select all records in the Names column WHERE the Births column equals 973]

# Create graph
df['Births'].plot()

# Maximum value in the data set
MaxValue = df['Births'].max()

# Name associated with the maximum value; take the first (and only) match so
# that Text below is a plain string rather than an array
MaxName = df['Names'][df['Births'] == df['Births'].max()].values[0]

# Text to display on graph
Text = str(MaxValue) + " - " + MaxName

# Add text to graph
plt.annotate(Text, xy=(1, MaxValue), xytext=(8, 0),
             xycoords=('axes fraction', 'data'), textcoords='offset points')

print("The most popular name")
df[df['Births'] == df['Births'].max()]
# Sorted.head(1) can also be used


############### Lesson 2: Read from TXT - Export to TXT - Select top/bottom records - Descriptive statistics - Group/sort data ###############

# 1. Prepare the data
names = ['Bob','Jessica','Mary','John','Mel']

np.random.seed(500)
random_names = [names[np.random.randint(low=0,high=len(names))] for i in range(1000)]
random_names[:10]

# The number of births per name for the year 1880
births = [np.random.randint(low=0,high=1000) for i in range(1000)]
births[:10]

BabyDataSet = list(zip(random_names,births))
BabyDataSet[:10]

df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
df[:10]

df.to_csv('births1880.txt', index=False, header=False)

Location = r'E:\PythonProject\births1880.txt'
df = pd.read_csv(Location)
df.info()
df.head()

df = pd.read_csv(Location, header=None)
df.info()
df.tail()

df = pd.read_csv(Location, names=['Names','Births'])
df.head(5)

import os
os.remove(Location)

# Use the unique property to find all the unique records in the Names column
# Method 1:
df['Names'].unique()

# If you actually want to print the unique values:
for x in df['Names'].unique():
    print(x)

# Method 2:
print(df['Names'].describe())

# Create a groupby object
# Since each baby name occurs multiple times, the data needs to be aggregated
name = df.groupby('Names')

# Apply the sum function to the groupby object
df = name.sum()
df

# Method 1:
Sorted = df.sort_values(['Births'], ascending=False)
Sorted.head(1)

# Method 2:
df['Births'].max()

# Create graph
df['Births'].plot.bar()

print("The most popular name")
df.sort_values(by='Births', ascending=False)
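# A small alternative sketch (not part of the original lesson): idxmax()
# returns the index label of the largest value directly, so the most popular
# name can be looked up without sorting. This assumes df is still the summed
# frame above, indexed by name.
df['Births'].idxmax()            # the name with the most births
df.loc[[df['Births'].idxmax()]]  # the full record for that name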
############### Lesson 3: Create functions - Read from EXCEL - Export to EXCEL - Outliers - Lambda functions - Slicing and dicing data ###############

# set seed
np.random.seed(111)

# Function to generate test data
def CreateDataSet(Number=1):
    Output = []
    for i in range(Number):
        # Create a weekly (mondays) date range
        rng = pd.date_range(start='1/1/2009', end='12/31/2012', freq='W-MON')
        # Create random data
        data = np.random.randint(low=25, high=1000, size=len(rng))
        # Status pool
        status = [1, 2, 3]
        # Make a random list of statuses
        random_status = [status[np.random.randint(low=0, high=len(status))] for i in range(len(rng))]
        # State pool
        states = ['GA', 'FL', 'fl', 'NY', 'NJ', 'TX']
        # Make a random list of states
        random_states = [states[np.random.randint(low=0, high=len(states))] for i in range(len(rng))]
        Output.extend(zip(random_states, random_status, data, rng))
    return Output

dataset = CreateDataSet(4)
df = pd.DataFrame(data=dataset, columns=['State','Status','CustomerCount','StatusDate'])
df.info()
df.head()

df.to_excel('Lesson3.xlsx', index=False)
print('Done')

# Location of file
Location = r'E:\PythonProject\Lesson3.xlsx'

# Parse a specific sheet
df = pd.read_excel(Location, 0, index_col='StatusDate')
df.dtypes
df.index
df.head()

# Show the distinct values in the State column
df['State'].unique()

# Convert all the State values to upper case
df['State'] = df.State.apply(lambda x: x.upper())
df['State'].unique()

# Only grab where Status == 1
mask = df['Status'] == 1
df = df[mask]

# Convert NJ to NY (use .loc to avoid chained assignment)
mask = df.State == 'NJ'
df.loc[mask, 'State'] = 'NY'
df['State'].unique()

df['CustomerCount'].plot(figsize=(15,5));

sortdf = df[df['State']=='NY'].sort_index(axis=0)
sortdf.head(10)

# Group by State and StatusDate
Daily = df.reset_index().groupby(['State','StatusDate']).sum()
Daily.head()

Daily.index
# Select the State index
Daily.index.levels[0]
Daily.index.levels[1]

Daily.loc['FL'].plot()
Daily.loc['GA'].plot()
Daily.loc['NY'].plot()
Daily.loc['TX'].plot()

Daily.loc['FL']['2012':].plot()
Daily.loc['GA']['2012':].plot()
Daily.loc['NY']['2012':].plot()
Daily.loc['TX']['2012':].plot()

StateYearMonth = Daily.groupby([Daily.index.get_level_values(0),
                                Daily.index.get_level_values(1).year,
                                Daily.index.get_level_values(1).month])
# Tukey fences: Q1 - 1.5*IQR and Q3 + 1.5*IQR
Daily['Lower'] = StateYearMonth['CustomerCount'].transform(
    lambda x: x.quantile(q=.25) - 1.5*(x.quantile(q=.75) - x.quantile(q=.25)) )
Daily['Upper'] = StateYearMonth['CustomerCount'].transform(
    lambda x: x.quantile(q=.75) + 1.5*(x.quantile(q=.75) - x.quantile(q=.25)) )
Daily['Outlier'] = (Daily['CustomerCount'] < Daily['Lower']) | (Daily['CustomerCount'] > Daily['Upper'])

# Remove Outliers
Daily = Daily[Daily['Outlier'] == False]
Daily.head()

# Combine all markets
# Get the max customer count by Date
ALL = pd.DataFrame(Daily['CustomerCount'].groupby(Daily.index.get_level_values(1)).sum())
ALL.columns = ['CustomerCount']  # rename column

# Group by Year and Month
YearMonth = ALL.groupby([lambda x: x.year, lambda x: x.month])

# What is the max customer count per Year and Month
ALL['Max'] = YearMonth['CustomerCount'].transform(lambda x: x.max())
ALL.head()

# Create the BHAG dataframe
data = [1000,2000,3000]
idx = pd.date_range(start='12/31/2011', end='12/31/2013', freq='A')
BHAG = pd.DataFrame(data, index=idx, columns=['BHAG'])
BHAG

# Combine the BHAG and the ALL data set
combined = pd.concat([ALL,BHAG], axis=0)
combined = combined.sort_index(axis=0)
combined.tail()

fig, axes = plt.subplots(figsize=(12, 7))
combined['BHAG'].fillna(method='pad').plot(color='green', label='BHAG')
combined['Max'].plot(color='blue', label='All Markets')
plt.legend(loc='best')

# Group by Year and then get the max value per year
Year = combined.groupby(lambda x: x.year).max()
Year

# Add a column representing the percent change per year
Year['YR_PCT_Change'] = Year['Max'].pct_change(periods=1)
Year

# Naive projection for next year, assuming the 2012 growth rate repeats
# (.loc replaces the deprecated .ix)
(1 + Year.loc[2012,'YR_PCT_Change']) * Year.loc[2012,'Max']
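# pct_change is (current - previous) / previous, so multiplying a value by
# (1 + pct) projects it forward by that rate. A minimal standalone check with
# toy numbers (hypothetical, not taken from the data set):
s = pd.Series([901.0, 1115.0], index=[2011, 2012])
pct = s.pct_change()             # NaN for 2011, (1115 - 901) / 901 for 2012
assert round((1 + pct.loc[2012]) * s.loc[2011]) == 1115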
# First Graph
ALL['Max'].plot(figsize=(10, 5)); plt.title('ALL Markets')

# Last four Graphs
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))
fig.subplots_adjust(hspace=1.0)  # Create space between plots

Daily.loc['FL']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,0])
Daily.loc['GA']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,1])
Daily.loc['TX']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,0])
Daily.loc['NY']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,1])

# Add titles
axes[0,0].set_title('Florida')
axes[0,1].set_title('Georgia')
axes[1,0].set_title('Texas')
axes[1,1].set_title('North East')


############### Lesson 4: Add/delete columns - Index operations ###############

# Our small data set
d = [0,1,2,3,4,5,6,7,8,9]

# Create dataframe
df = pd.DataFrame(d)
df

# Lets change the name of the column
df.columns = ['Rev']
df

# Lets add a column
df['NewCol'] = 5
df

# Lets modify our new column
df['NewCol'] = df['NewCol'] + 1
df

del df['NewCol']
df

# Lets add a couple of columns
df['test'] = 3
df['col'] = df['Rev']
df

# If we wanted, we could change the name of the index
i = ['a','b','c','d','e','f','g','h','i','j']
df.index = i
df

df.loc['a']

# df.loc[inclusive:inclusive]
df.loc['a':'d']

# df.iloc[inclusive:exclusive]
# Note: .iloc is strictly integer position based. It has been available since
# version 0.11.0 (http://pandas.pydata.org/pandas-docs/stable/whatsnew.html#v0-11-0-april-22-2013)
df.iloc[0:3]

df['Rev']
df[['Rev', 'test']]

# df.ix[rows, columns] is deprecated; df.loc combined with df.index replaces it
# df.ix[0:3,'Rev']
df.loc[df.index[0:3],'Rev']
# df.ix[5:,'col']
df.loc[df.index[5:],'col']
# df.ix[:3,['col', 'test']]
df.loc[df.index[:3],['col', 'test']]

# Select top N number of records (default = 5)
df.head()
# Select bottom N number of records (default = 5)
df.tail()


############### Lesson 5: Stack/Unstack/Transpose functions ###############

# Our small data set
d = {'one':[1,1],'two':[2,2]}
i = ['a','b']

# Create dataframe
df = pd.DataFrame(data = d, index = i)
df
df.index

# Stack the columns into the index, producing a Series with a MultiIndex
stack = df.stack()
stack
stack.index

# Unstack pivots a level of the index back into columns
unstack = df.unstack()
unstack
unstack.index

# Transpose swaps rows and columns
transpose = df.T
transpose
transpose.index


############### Lesson 6: Group by ###############

# Our small data set
d = {'one':[1,1,1,1,1],
     'two':[2,2,2,2,2],
     'letter':['a','a','b','b','c']}

# Create dataframe
df = pd.DataFrame(d)
df

# Create group object
one = df.groupby('letter')

# Apply sum function
one.sum()

letterone = df.groupby(['letter','one']).sum()
letterone
letterone.index

# as_index=False keeps the group keys as ordinary columns instead of the index
letterone = df.groupby(['letter','one'], as_index=False).sum()
letterone
letterone.index
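# A small extension (not part of the original lesson): agg() applies several
# reducers in one call, giving a quick per-group summary of the same frame.
df.groupby('letter').agg(['sum', 'mean', 'count'])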
############### Lesson 7: Ways to calculate outliers ###############

# Create a dataframe with dates as your index
States = ['NY', 'NY', 'NY', 'NY', 'FL', 'FL', 'GA', 'GA', 'FL', 'FL']
data = [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10]
idx = pd.date_range('1/1/2012', periods=10, freq='MS')
df1 = pd.DataFrame(data, index=idx, columns=['Revenue'])
df1['State'] = States

# Create a second dataframe
data2 = [10.0, 10.0, 9, 9, 8, 8, 7, 7, 6, 6]
idx2 = pd.date_range('1/1/2013', periods=10, freq='MS')
df2 = pd.DataFrame(data2, index=idx2, columns=['Revenue'])
df2['State'] = States

# Combine dataframes
df = pd.concat([df1,df2])
df

# Method 1
# make a copy of original df
newdf = df.copy()
newdf['x-Mean'] = abs(newdf['Revenue'] - newdf['Revenue'].mean())
newdf['1.96*std'] = 1.96*newdf['Revenue'].std()
newdf['Outlier'] = abs(newdf['Revenue'] - newdf['Revenue'].mean()) > 1.96*newdf['Revenue'].std()
newdf

# Method 2
# Group by item
# make a copy of original df; select the Revenue column before transform so a
# Series (not a frame) is assigned back
newdf = df.copy()
State = newdf.groupby('State')
newdf['Outlier'] = State['Revenue'].transform( lambda x: abs(x-x.mean()) > 1.96*x.std() )
newdf['x-Mean'] = State['Revenue'].transform( lambda x: abs(x-x.mean()) )
newdf['1.96*std'] = State['Revenue'].transform( lambda x: 1.96*x.std() )
newdf

# Method 2
# Group by multiple items
# make a copy of original df
newdf = df.copy()
StateMonth = newdf.groupby(['State', lambda x: x.month])
newdf['Outlier'] = StateMonth['Revenue'].transform( lambda x: abs(x-x.mean()) > 1.96*x.std() )
newdf['x-Mean'] = StateMonth['Revenue'].transform( lambda x: abs(x-x.mean()) )
newdf['1.96*std'] = StateMonth['Revenue'].transform( lambda x: 1.96*x.std() )
newdf

# Method 3
# Group by item
# make a copy of original df
newdf = df.copy()
State = newdf.groupby('State')

def s(group):
    group['x-Mean'] = abs(group['Revenue'] - group['Revenue'].mean())
    group['1.96*std'] = 1.96*group['Revenue'].std()
    group['Outlier'] = abs(group['Revenue'] - group['Revenue'].mean()) > 1.96*group['Revenue'].std()
    return group

Newdf2 = State.apply(s)
Newdf2

# IQR (Tukey fence) method
# make a copy of original df
newdf = df.copy()
State = newdf.groupby('State')
newdf['Lower'] = State['Revenue'].transform( lambda x: x.quantile(q=.25) - (1.5*(x.quantile(q=.75)-x.quantile(q=.25))) )
newdf['Upper'] = State['Revenue'].transform( lambda x: x.quantile(q=.75) + (1.5*(x.quantile(q=.75)-x.quantile(q=.25))) )
newdf['Outlier'] = (newdf['Revenue'] < newdf['Lower']) | (newdf['Revenue'] > newdf['Upper'])
newdf


############### Lesson 8: Read from a MySQL database ###############

from sqlalchemy import create_engine, MetaData, Table

# Parameters
TableName = "employees"

# Create the connection
engine = create_engine("mysql+pymysql://root:root@localhost:3306/test")
conn = engine.connect()

# Required for querying tables
metadata = MetaData(conn)

# Table to query
tbl = Table(TableName, metadata, autoload=True)
#tbl.create(checkfirst=True)

# Select all
sql = tbl.select()

# run sql code
result = conn.execute(sql)
result

# Insert to a dataframe
df = pd.DataFrame(data=list(result), columns=result.keys())
df

# Close connection
conn.close()
print('Done')


############### Lesson 9: Export to CSV / EXCEL / TXT ###############

df.to_csv('DimDate.csv', index=False)
print('Done')
df.to_excel('DimDate.xls', index=False)
print('Done')
df.to_csv('DimDate.txt', index=False)
print('Done')


############### Lesson 10: Converting between different formats ###############

# Create DataFrame
d = [1,2,3,4,5,6,7,8,9]
df = pd.DataFrame(d, columns = ['Number'])
df

# Export to Excel
df.to_excel('Lesson10.xlsx', sheet_name = 'testing', index = False)
print('Done')

# Path to excel file
# Your path will be different, please modify the path below.
location = r'E:\PythonProject\Lesson10.xlsx'

# Parse the excel file
df = pd.read_excel(location, 0)
df.head()
df.dtypes
df.tail()

df.to_json('Lesson10.json')
print('Done')

# Your path will be different, please modify the path below.
jsonloc = r'E:\PythonProject\Lesson10.json'

# read json file
df2 = pd.read_json(jsonloc)
df2.dtypes


############### Lesson 11: Combining data from various sources ###############

# Create DataFrame
d = {'Channel':[1], 'Number':[255]}
df = pd.DataFrame(d)
df

# Export to Excel
df.to_excel('test1.xlsx', sheet_name = 'test1', index = False)
df.to_excel('test2.xlsx', sheet_name = 'test2', index = False)
df.to_excel('test3.xlsx', sheet_name = 'test3', index = False)
print('Done')

# List to hold file names
FileNames = []

# Your path will be different, please modify the path below.
os.chdir(r"E:\PythonProject")

# Find any file that ends with ".xlsx"
for files in os.listdir("."):
    if files.endswith(".xlsx"):
        FileNames.append(files)
FileNames
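# A common alternative (not in the original lesson) is the glob module, which
# matches the same pattern in a single call:
import glob
sorted(glob.glob('*.xlsx'))  # same files as the loop above, sorted for a stable order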
def GetFile(fnombre):
    # Path to excel file
    # Your path will be different, please modify the path below.
    # (a raw string cannot end with a backslash, so join the path instead)
    location = os.path.join(r'E:\PythonProject', fnombre)

    # Parse the excel file
    # 0 = first sheet
    df = pd.read_excel(location, 0)

    # Tag record to file name
    df['File'] = fnombre

    # Make the "File" column the index of the df
    return df.set_index(['File'])

# Create a list of dataframes
df_list = [GetFile(fname) for fname in FileNames]
df_list

# Combine all of the dataframes into one
big_df = pd.concat(df_list)
big_df
big_df.dtypes

# Plot it!
big_df['Channel'].plot.bar()


############### Practical data analysis with Python ###############

# 1. Import and inspect the data
import numpy as np
import pandas as pd

ver = pd.read_csv("ver.csv")
pd.set_option('display.max_columns', 80)
ver.head(3)
ver.shape
len(ver)
ver.columns
ver['action_taken'][:5]

# Split applicant income into 14 equal-width ranges
incomeranges = pd.cut(ver['applicant_income_000s'], 14)
incomeranges[:5]
incomeranges.value_counts()

# First row, first six columns (.iloc replaces the deprecated .ix)
ver.iloc[0, 0:6]

# .sort_values() replaces the removed .order()/.sort() methods
ver['loan_amount_000s'].sort_values()[:5]
sorteddata = ver.sort_values(['loan_amount_000s'])
sorteddata.iloc[:, 0:6].head(3)
sorteddata.iloc[0:3, 0:3]

ver['action_taken_name'].value_counts()

# Pair each column name with the type of its first value
list(zip(ver.columns, [type(x) for x in ver.iloc[0, :]]))
ver.dtypes

ver['county_name'].unique()
len(ver['county_name'].unique())

ver.loc[0:3, 'preapproval_name']
ver.loc[0:3, 'preapproval_name'] == "Preapproval was requested"

# 2. Summarize the data
ver = pd.read_csv("ver.csv")

melt = pd.melt(ver, id_vars = 'loan_purpose_name')
melt.iloc[0:5,:]

melt = pd.melt(ver, id_vars = 'county_name')
melt.iloc[0:5,:]

ver.describe()

pd.crosstab(ver['county_name'], ver['action_taken_name'])

incomesubset = ver[(ver['applicant_income_000s'] > 0) & (ver['applicant_income_000s'] < 1000)]
incomesubset
incomesubset.shape

# query() expresses the same filter as the boolean mask above
qry1 = ver.query('(applicant_income_000s > 0) & (applicant_income_000s < 1000)')
qry1.head(10)
qry1.shape

# mean() only applies to numeric columns, so request numeric_only explicitly
grouped1 = ver.groupby(['applicant_race_name_1','loan_purpose_name']).mean(numeric_only=True)
grouped1

# The string column is dropped by mean(); take its unique values per group directly
ver.groupby(['applicant_race_name_1','loan_purpose_name'])['action_taken_name'].unique()
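# For reference, a toy illustration of what pd.melt does above (hypothetical
# frame, not taken from ver.csv): every column except the id_vars becomes
# (variable, value) pairs.
wide = pd.DataFrame({'county_name': ['A', 'B'], 'x': [1, 2], 'y': [3, 4]})
pd.melt(wide, id_vars='county_name')
#   county_name variable  value
# 0           A        x      1
# 1           B        x      2
# 2           A        y      3
# 3           B        y      4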