import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import sys
from statsmodels.stats.diagnostic import normal_ad

# Silence all warnings for the session, but only when the interpreter was
# started without explicit -W options (i.e. sys.warnoptions is empty).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


# Mount Google Drive at /content/drive/ (Colab-only import).
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


%cd drive/MyDrive/dataset/uber-and-lyft-dataset-boston-ma

/content/drive/MyDrive/dataset/uber-and-lyft-dataset-boston-ma


# Load the rideshare CSV from the current working directory.
df = pd.read_csv('rideshare_kaggle.csv')
df


# Show the number of columns together with each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693071 entries, 0 to 693070
Data columns (total 57 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           693071 non-null  object 
 1   timestamp                    693071 non-null  float64
 2   hour                         693071 non-null  int64  
 3   day                          693071 non-null  int64  
 4   month                        693071 non-null  int64  
 5   datetime                     693071 non-null  object 
 6   timezone                     693071 non-null  object 
 7   source                       693071 non-null  object 
 8   destination                  693071 non-null  object 
 9   cab_type                     693071 non-null  object 
 10  product_id                   693071 non-null  object 
 11  name                         693071 non-null  object 
 12  price                        637976 non-null  float64
 13  distance                     693071 non-null  float64
 14  surge_multiplier             693071 non-null  float64
 15  latitude                     693071 non-null  float64
 16  longitude                    693071 non-null  float64
 17  temperature                  693071 non-null  float64
 18  apparentTemperature          693071 non-null  float64
 19  short_summary                693071 non-null  object 
 20  long_summary                 693071 non-null  object 
 21  precipIntensity              693071 non-null  float64
 22  precipProbability            693071 non-null  float64
 23  humidity                     693071 non-null  float64
 24  windSpeed                    693071 non-null  float64
 25  windGust                     693071 non-null  float64
 26  windGustTime                 693071 non-null  int64  
 27  visibility                   693071 non-null  float64
 28  temperatureHigh              693071 non-null  float64
 29  temperatureHighTime          693071 non-null  int64  
 30  temperatureLow               693071 non-null  float64
 31  temperatureLowTime           693071 non-null  int64  
 32  apparentTemperatureHigh      693071 non-null  float64
 33  apparentTemperatureHighTime  693071 non-null  int64  
 34  apparentTemperatureLow       693071 non-null  float64
 35  apparentTemperatureLowTime   693071 non-null  int64  
 36  icon                         693071 non-null  object 
 37  dewPoint                     693071 non-null  float64
 38  pressure                     693071 non-null  float64
 39  windBearing                  693071 non-null  int64  
 40  cloudCover                   693071 non-null  float64
 41  uvIndex                      693071 non-null  int64  
 42  visibility.1                 693071 non-null  float64
 43  ozone                        693071 non-null  float64
 44  sunriseTime                  693071 non-null  int64  
 45  sunsetTime                   693071 non-null  int64  
 46  moonPhase                    693071 non-null  float64
 47  precipIntensityMax           693071 non-null  float64
 48  uvIndexTime                  693071 non-null  int64  
 49  temperatureMin               693071 non-null  float64
 50  temperatureMinTime           693071 non-null  int64  
 51  temperatureMax               693071 non-null  float64
 52  temperatureMaxTime           693071 non-null  int64  
 53  apparentTemperatureMin       693071 non-null  float64
 54  apparentTemperatureMinTime   693071 non-null  int64  
 55  apparentTemperatureMax       693071 non-null  float64
 56  apparentTemperatureMaxTime   693071 non-null  int64  
dtypes: float64(29), int64(17), object(11)
memory usage: 301.4+ MB


# Count how many columns there are of each dtype.
df.dtypes.value_counts()

float64    29
int64      17
object     11
dtype: int64


# Summary statistics for the numeric columns.
df.describe()


# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0


# Separate the rides by provider using boolean masks on `cab_type`.
uber_rows = df['cab_type'] == 'Uber'
lyft_rows = df['cab_type'] == 'Lyft'
df_uber = df.loc[uber_rows]
df_lyft = df.loc[lyft_rows]


# Per-column count of missing values.
df.isna().sum()

id                                 0
timestamp                          0
hour                               0
day                                0
month                              0
datetime                           0
timezone                           0
source                             0
destination                        0
cab_type                           0
product_id                         0
name                               0
price                          55095
distance                           0
surge_multiplier                   0
latitude                           0
longitude                          0
temperature                        0
apparentTemperature                0
short_summary                      0
long_summary                       0
precipIntensity                    0
precipProbability                  0
humidity                           0
windSpeed                          0
windGust                           0
windGustTime                       0
visibility                         0
temperatureHigh                    0
temperatureHighTime                0
temperatureLow                     0
temperatureLowTime                 0
apparentTemperatureHigh            0
apparentTemperatureHighTime        0
apparentTemperatureLow             0
apparentTemperatureLowTime         0
icon                               0
dewPoint                           0
pressure                           0
windBearing                        0
cloudCover                         0
uvIndex                            0
visibility.1                       0
ozone                              0
sunriseTime                        0
sunsetTime                         0
moonPhase                          0
precipIntensityMax                 0
uvIndexTime                        0
temperatureMin                     0
temperatureMinTime                 0
temperatureMax                     0
temperatureMaxTime                 0
apparentTemperatureMin             0
apparentTemperatureMinTime         0
apparentTemperatureMax             0
apparentTemperatureMaxTime         0
dtype: int64


# Fill missing values in the "price" column with its median.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column, so the update does not rely on that selection being a view.
df['price'] = df['price'].fillna(df['price'].median())


# NOTE(review): df_uber / df_lyft are sliced from df before this imputation,
# so they presumably still contain the missing prices; mean() and std() skip
# NaN by default, so these statistics describe the observed prices only.
mean_uber = df_uber['price'].mean()
stdev_uber = df_uber['price'].std()

mean_lyft = df_lyft['price'].mean()
stdev_lyft = df_lyft['price'].std()


# One column per provider, one row per statistic.
# Fix: the Uber "Standard Deviation" entry previously held Lyft's mean.
data = {
    "Uber": [mean_uber, stdev_uber],
    "Lyft": [mean_lyft, stdev_lyft],
}

index = ["Mean", "Standard Deviation"]

dataFrame = pd.DataFrame(data=data, index=index)

dataFrame.plot.bar(rot=0,title="Perbandingan Rata-rata dan Standar Deviasi dari Harga Uber vs Lyft", color=['crimson','steelblue'],figsize=(10,5))
# Place the legend outside the axes, to the right of the bars.
plt.gca().legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show(block=True)


# Largest value in the `hour` column (used to choose the hour bands).
df.hour.max()

23


# Smallest value in the `hour` column (used to choose the hour bands).
df.hour.min()

0


# Hour bands as (first hour, last hour, label); both bounds are inclusive.
hour_bands = [
    (0, 3, '00-03'),
    (4, 6, '04-06'),
    (7, 9, '07-09'),
    (10, 12, '10-12'),
    (13, 15, '13-15'),
    (16, 18, '16-18'),
    (19, 21, '19-21'),
    (22, 24, '22-24'),
]

# Label every ride with the band its hour falls into.
for first_hour, last_hour, band in hour_bands:
    in_band = (df['hour'] >= first_hour) & (df['hour'] <= last_hour)
    df.loc[in_band, 'hourGrouped'] = band

index = [band for _, _, band in hour_bands]


# Uber rides, one frame per hour band (same order as `index`).
is_uber = df['cab_type'] == 'Uber'
dv11, dv12, dv13, dv14, dv15, dv16, dv17, dv18 = [
    df[(df['hourGrouped'] == band) & is_uber] for band in index
]


# Lyft rides, one frame per hour band (same order as `index`).
is_lyft = df['cab_type'] == 'Lyft'
dv19, dv110, dv111, dv112, dv113, dv114, dv115, dv116 = [
    df[(df['hourGrouped'] == band) & is_lyft] for band in index
]


# Mean price per band for each provider.
uber_by_band = (dv11, dv12, dv13, dv14, dv15, dv16, dv17, dv18)
lyft_by_band = (dv19, dv110, dv111, dv112, dv113, dv114, dv115, dv116)
data = {
    "Uber mean": [rides['price'].mean() for rides in uber_by_band],
    "Lyft mean": [rides['price'].mean() for rides in lyft_by_band],
}

dataFrame = pd.DataFrame(data=data, index=index)

dataFrame.plot.bar(rot=0,title="Rata-rata Harga Uber vs Lyft Per 3 Jam", color=['#f48668','#73a580'],figsize=(15,5))
# Place the legend outside the axes, to the right of the bars.
plt.gca().legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show(block=True)


# Lowest recorded temperature (used to choose the temperature bands).
df.temperature.min()

18.91


# Highest recorded temperature (used to choose the temperature bands).
df.temperature.max()

57.22


# Temperature bands are 5 degrees wide, from 15 up to (but excluding) 60.
# Each band includes its lower bound and excludes its upper bound.
band_starts = [15, 20, 25, 30, 35, 40, 45, 50, 55]
index = ["15-20", "20-25", '25-30', '30-35', '35-40', '40-45', '45-50', '50-55', '55-60']

# Label every ride with the band its temperature falls into.
for band_start, band in zip(band_starts, index):
    in_band = (df['temperature'] >= band_start) & (df['temperature'] < band_start + 5)
    df.loc[in_band, 'temperatureGrouped'] = band


# Uber rides, one frame per temperature band (same order as `index`).
# Fix: dv29 used to compare against a corrupted label instead of '55-60',
# so it was always empty and its mean price was NaN.
is_uber = df['cab_type'] == 'Uber'
dv21, dv22, dv23, dv24, dv25, dv26, dv27, dv28, dv29 = [
    df[(df['temperatureGrouped'] == band) & is_uber] for band in index
]


# Lyft rides, one frame per temperature band (same order as `index`).
is_lyft = df['cab_type'] == 'Lyft'
dv210, dv211, dv212, dv213, dv214, dv215, dv216, dv217, dv218 = [
    df[(df['temperatureGrouped'] == band) & is_lyft] for band in index
]


# Mean price per band for each provider.
uber_by_band = (dv21, dv22, dv23, dv24, dv25, dv26, dv27, dv28, dv29)
lyft_by_band = (dv210, dv211, dv212, dv213, dv214, dv215, dv216, dv217, dv218)
data = {
    "Uber mean": [rides['price'].mean() for rides in uber_by_band],
    "Lyft mean": [rides['price'].mean() for rides in lyft_by_band],
}

dataFrame = pd.DataFrame(data=data, index=index)

dataFrame.plot.bar(rot=0,title="Rata-rata Harga Uber vs Lyft Per 5 Derajat Temperatur", color=['darkorange','royalblue'],figsize=(15,5))
# Place the legend outside the axes, to the right of the bars.
plt.gca().legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show(block=True)


# Pick-up (source) and drop-off (destination) columns for each provider.
dv41 = df_uber[['source','destination']]
dv42 = df_lyft[['source','destination']]


# Twelve-colour palettes used by the pie charts.
colors1 = sns.color_palette("Spectral",12)
colors2 = sns.color_palette("rainbow",12)


# Ride counts per location. Each result has a single column named 'source'
# holding the count, indexed by the grouping column.
# The stray `index=False` keyword was removed: it is not an option of `agg`
# (NOTE(review): newer pandas appears to forward it to `count`, which would
# raise -- confirm against the pandas version in use).
dv411 = dv41.groupby(['source'],as_index=True).agg({'source':'count'})
dv412 = dv41.groupby(['destination'],as_index=True).agg({'source':'count'})
dv413 = dv42.groupby(['source'],as_index=True).agg({'source':'count'})
dv414 = dv42.groupby(['destination'],as_index=True).agg({'source':'count'})


# One pie chart per (provider, source/destination) count table.
# Fix: each chart is now labelled with its own index; three of the four
# charts previously reused dv411's index, which is only correct when every
# table lists the same locations in the same order.
pie_specs = [
    (dv411, colors1, 'Proporsi Source Uber'),
    (dv412, colors1, 'Proporsi Destination Uber'),
    (dv413, colors2, 'Proporsi Source Lyft'),
    (dv414, colors2, 'Proporsi Destination Lyft'),
]

for location_counts, palette, pie_title in pie_specs:
    location_counts.plot(kind='pie',figsize=(12, 7),autopct='%1.4f%%',startangle=90,shadow=True,subplots=True,colors=palette,
                         textprops={'fontsize': 8},labels=location_counts.index,legend=False,wedgeprops={'linewidth': 2.0, 'edgecolor': 'white'})
    plt.title(pie_title, loc='center',size ='15')
    plt.axis('off')
    plt.show()


# Price distribution per provider, one facet per `cab_type`.
grid1 = sns.FacetGrid(df, col='cab_type', height=5, aspect=1.6)
# `distplot` is deprecated; `histplot` with a KDE overlay on a density scale
# is used in its place (histogram plus density curve, as before).
grid1.map_dataframe(sns.histplot, x='price', bins=50, color='c', kde=True, stat='density')
# Leave room above the facets for the figure-level title.
plt.subplots_adjust(top=0.85)
grid1.fig.suptitle('Distribusi dari Harga Uber vs Lyft')

/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Text(0.5, 0.98, 'Distribusi dari Harga Uber vs Lyft')


# Anderson-Darling normality test on the Uber prices.
# Missing prices are dropped first: a NaN in the sample turns the test
# statistic into NaN, so the reported p-value would not reflect the data.
p_value = normal_ad(df_uber['price'].dropna())[1]
print('p-value Uber Price from the test Anderson-Darling test below 0.05 generally means non-normal:', p_value)

# Report the outcome for Uber at the 0.05 significance level.
if p_value < 0.05:
    print('Harga Uber tidak normally distributed','\n')
else:
    print('Harga Uber normally distributed','\n')

# Same test on the Lyft prices, again ignoring missing values.
p_value1 = normal_ad(df_lyft['price'].dropna())[1]
print('p-value Lyft Price from the test Anderson-Darling test below 0.05 generally means non-normal:', p_value1)

# Report the outcome for Lyft at the 0.05 significance level.
if p_value1 < 0.05:
    print('Harga Lyft tidak normally distributed')
else:
    print('Harga Lyft normally distributed')

p-value Uber Price from the test Anderson-Darling test below 0.05 generally means non-normal: 0.0
Harga Uber tidak normally distributed 

p-value Lyft Price from the test Anderson-Darling test below 0.05 generally means non-normal: 0.0
Harga Lyft tidak normally distributed


# Columns left out of the correlation analysis: identifiers, timestamps,
# categorical/text columns, coordinates and redundant weather fields.
# Fix: 'apparentTemperatureHighTime' was listed twice; the second entry is
# now 'apparentTemperatureLowTime', the one *Time column that was not dropped.
df_cor = df.drop(['id','timestamp','day','month','datetime','timezone',
               'source','destination','cab_type','product_id','name',
               'latitude','longitude','apparentTemperature','precipProbability',
               'windGustTime','temperatureHighTime','temperatureLowTime',
               'apparentTemperatureHighTime','apparentTemperatureLowTime',
               'visibility.1','uvIndexTime','short_summary','long_summary',
               'icon','sunriseTime','sunsetTime','moonPhase',
               'temperatureMinTime','temperatureMaxTime','apparentTemperatureMax',
               'apparentTemperatureMaxTime','apparentTemperatureMinTime'], axis=1)
df_cor


# Heatmap of the pairwise correlations between the numeric columns of df_cor.
fig, ax = plt.subplots(figsize=(30,30))
# Select numeric columns explicitly: any string-valued columns still present
# in df_cor cannot be correlated, and recent pandas raises on them instead of
# silently skipping them.
numeric_cor = df_cor.select_dtypes(include='number')
sns.heatmap(numeric_cor.corr(), annot=True, fmt='.2%',annot_kws={"size": 10},cmap="inferno", ax=ax)
plt.title("Korelasi Antar Variabel", loc='center',size ='15')
plt.show()


# Keep only the target ('price') and the predictors used by the model.
model_columns = ['price','distance','surge_multiplier','cab_type','name']
df1 = df.loc[:, model_columns]
df1


# Names of the columns with object or category dtype; these get encoded next.
categorical = df1.select_dtypes(include=['object','category'])
cat_col = list(categorical.columns)
print(cat_col)

['cab_type', 'name']


# Replace each categorical column with its one-hot indicator columns.
for col in cat_col:
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded = encoder.fit_transform(df1[[col]]).toarray()
    # get_feature_names() is deprecated since scikit-learn 1.0 and removed in
    # 1.2; get_feature_names_out() yields the same "<column>_<category>" names.
    # Reusing df1's index keeps rows aligned in the concat below even if df1
    # does not carry a default 0..n-1 index.
    enc_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out([col]),
        index=df1.index,
    )
    df1 = pd.concat([df1.drop(col, axis=1), enc_df], axis=1)

/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)

# Display the frame after one-hot encoding.
df1


# Hold out 25% of the rows for testing; random_state fixes the shuffle.
train, test = train_test_split(df1, random_state=2, test_size=0.25)
train_index, test_index = train.index, test.index


# Training partition: predictors and the target kept as a one-column frame.
x_train, y_train = train.drop(columns=['price']), train[['price']]


# Testing partition, split the same way.
x_test, y_test = test.drop(columns=['price']), test[['price']]


# Learn the scaling parameters from the training predictors only, then apply
# that same transformation to both partitions.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)


x_test = scaler.transform(x_test)


# Build the linear regression model from the training predictors and target.
# fit() returns the estimator itself, so `model` and `lr` are the same object.
lr = LinearRegression()
lr.fit(x_train, y_train)
model = lr


# Training-set score plus the fitted intercept and coefficients.
r_sq = model.score(x_train,y_train)
fit_summary = (
    ('Coefficient of determination:', r_sq),
    ('Intercept:', model.intercept_),
    ('Slope:', model.coef_),
)
for caption, value in fit_summary:
    print(caption, value)

Coefficient of determination: 0.91784152269563
Intercept: [16.29724559]
Slope: [[ 2.90871711e+00  1.68007553e+00 -2.21755320e+12  1.28958259e+12
  -1.58114609e+12 -1.57943382e+12  3.18597766e+11  3.18472152e+11
   3.18014795e+11  3.18525451e+11  3.18996996e+11  3.18087270e+11
  -1.58116355e+12 -1.58063967e+12 -1.58159993e+12 -1.57827918e+12
  -1.57529959e+12]]


# Coefficient of predictor 12 (presumably `name_Taxi`, going by the column
# order of the training table shown in this file -- confirm).
model.coef_[0][12]

-1581163549068.1245


# Coefficient of predictor 13 (presumably `name_UberPool`, going by the column
# order of the training table shown in this file -- confirm).
model.coef_[0][13]

-1580639670825.4922


# Coefficient of predictor 16 (presumably `name_WAV`, going by the column
# order of the training table shown in this file -- confirm).
model.coef_[0][16]

-1575299589469.0164


# Attach the predictions, rounded to two decimals, to the training rows.
y_pred = model.predict(x_train)
train['Estimated Y'] = y_pred.round(decimals=2)
train


# Do the same for the held-out rows; `y_pred` ends up holding these predictions.
y_pred = model.predict(x_test)
test['Estimated Y'] = y_pred.round(decimals=2)
test


# Score of the model on the held-out rows.
model.score(x_test, y_test)

0.9188741580400912


# Side-by-side table of the model score on the training and testing data.
compared = pd.DataFrame(
    [
        ('Training Data', model.score(x_train,y_train)),
        ('Testing Data', model.score(x_test, y_test)),
    ],
    columns=['Keterangan', 'Tingkat Akurasi'],
)
compared

	id	timestamp	hour	day	month	datetime	timezone	source	destination	cab_type	...	precipIntensityMax	uvIndexTime	temperatureMin	temperatureMinTime	temperatureMax	temperatureMaxTime	apparentTemperatureMin	apparentTemperatureMinTime	apparentTemperatureMax	apparentTemperatureMaxTime
0	424553bb-7174-41ea-aeb4-fe06d4f4b9d7	1.544953e+09	9	16	12	2018-12-16 09:30:07	America/New_York	Haymarket Square	North Station	Lyft	...	0.1276	1544979600	39.89	1545012000	43.68	1544968800	33.73	1545012000	38.07	1544958000
1	4bd23055-6827-41c6-b23b-3c491f24e74d	1.543284e+09	2	27	11	2018-11-27 02:00:23	America/New_York	Haymarket Square	North Station	Lyft	...	0.1300	1543251600	40.49	1543233600	47.30	1543251600	36.20	1543291200	43.92	1543251600
2	981a3613-77af-4620-a42a-0c0866077d1e	1.543367e+09	1	28	11	2018-11-28 01:00:22	America/New_York	Haymarket Square	North Station	Lyft	...	0.1064	1543338000	35.36	1543377600	47.55	1543320000	31.04	1543377600	44.12	1543320000
3	c2d88af2-d278-4bfd-a8d0-29ca77cc5512	1.543554e+09	4	30	11	2018-11-30 04:53:02	America/New_York	Haymarket Square	North Station	Lyft	...	0.0000	1543507200	34.67	1543550400	45.03	1543510800	30.30	1543550400	38.53	1543510800
4	e0126e1f-8ca9-4f2e-82b3-50505a09db9a	1.543463e+09	3	29	11	2018-11-29 03:49:20	America/New_York	Haymarket Square	North Station	Lyft	...	0.0001	1543420800	33.10	1543402800	42.18	1543420800	29.11	1543392000	35.75	1543420800
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
693066	616d3611-1820-450a-9845-a9ff304a4842	1.543708e+09	23	1	12	2018-12-01 23:53:05	America/New_York	West End	North End	Uber	...	0.0000	1543683600	31.42	1543658400	44.76	1543690800	27.77	1543658400	44.09	1543690800
693067	633a3fc3-1f86-4b9e-9d48-2b7132112341	1.543708e+09	23	1	12	2018-12-01 23:53:05	America/New_York	West End	North End	Uber	...	0.0000	1543683600	31.42	1543658400	44.76	1543690800	27.77	1543658400	44.09	1543690800
693068	64d451d0-639f-47a4-9b7c-6fd92fbd264f	1.543708e+09	23	1	12	2018-12-01 23:53:05	America/New_York	West End	North End	Uber	...	0.0000	1543683600	31.42	1543658400	44.76	1543690800	27.77	1543658400	44.09	1543690800
693069	727e5f07-a96b-4ad1-a2c7-9abc3ad55b4e	1.543708e+09	23	1	12	2018-12-01 23:53:05	America/New_York	West End	North End	Uber	...	0.0000	1543683600	31.42	1543658400	44.76	1543690800	27.77	1543658400	44.09	1543690800
693070	e7fdc087-fe86-40a5-a3c3-3b2a8badcbda	1.543708e+09	23	1	12	2018-12-01 23:53:05	America/New_York	West End	North End	Uber	...	0.0000	1543683600	31.42	1543658400	44.76	1543690800	27.77	1543658400	44.09	1543690800

	timestamp	hour	day	month	price	distance	surge_multiplier	latitude	longitude	temperature	...	precipIntensityMax	uvIndexTime	temperatureMin	temperatureMinTime	temperatureMax	temperatureMaxTime	apparentTemperatureMin	apparentTemperatureMinTime	apparentTemperatureMax	apparentTemperatureMaxTime
count	6.930710e+05	693071.000000	693071.000000	693071.000000	637976.000000	693071.000000	693071.000000	693071.000000	693071.000000	693071.000000	...	693071.000000	6.930710e+05	693071.000000	6.930710e+05	693071.000000	6.930710e+05	693071.000000	6.930710e+05	693071.000000	6.930710e+05
mean	1.544046e+09	11.619137	17.794365	11.586684	16.545125	2.189430	1.013870	42.338172	-71.066151	39.584388	...	0.037374	1.544044e+09	33.457774	1.544042e+09	45.261313	1.544047e+09	29.731002	1.544048e+09	41.997343	1.544048e+09
std	6.891925e+05	6.948114	9.982286	0.492429	9.324359	1.138937	0.091641	0.047840	0.020302	6.726084	...	0.055214	6.912028e+05	6.467224	6.901954e+05	5.645046	6.901353e+05	7.110494	6.871862e+05	6.936841	6.910777e+05
min	1.543204e+09	0.000000	1.000000	11.000000	2.500000	0.020000	1.000000	42.214800	-71.105400	18.910000	...	0.000000	1.543162e+09	15.630000	1.543122e+09	33.510000	1.543154e+09	11.810000	1.543136e+09	28.950000	1.543187e+09
25%	1.543444e+09	6.000000	13.000000	11.000000	9.000000	1.280000	1.000000	42.350300	-71.081000	36.450000	...	0.000000	1.543421e+09	30.170000	1.543399e+09	42.570000	1.543439e+09	27.760000	1.543399e+09	36.570000	1.543439e+09
50%	1.543737e+09	12.000000	17.000000	12.000000	13.500000	2.160000	1.000000	42.351900	-71.063100	40.490000	...	0.000400	1.543770e+09	34.240000	1.543727e+09	44.680000	1.543788e+09	30.130000	1.543745e+09	40.950000	1.543788e+09
75%	1.544828e+09	18.000000	28.000000	12.000000	22.500000	2.920000	1.000000	42.364700	-71.054200	43.580000	...	0.091600	1.544807e+09	38.880000	1.544789e+09	46.910000	1.544814e+09	35.710000	1.544789e+09	44.120000	1.544818e+09
max	1.545161e+09	23.000000	30.000000	12.000000	97.500000	7.860000	3.000000	42.366100	-71.033000	57.220000	...	0.145900	1.545152e+09	43.100000	1.545192e+09	57.870000	1.545109e+09	40.050000	1.545134e+09	57.200000	1.545109e+09

	hour	price	distance	surge_multiplier	temperature	precipIntensity	humidity	windSpeed	windGust	visibility	...	windBearing	cloudCover	uvIndex	ozone	precipIntensityMax	temperatureMin	temperatureMax	apparentTemperatureMin	hourGrouped	temperatureGrouped
0	9	5.0	0.44	1.0	42.34	0.0000	0.68	8.66	9.17	10.000	...	57	0.72	0	303.8	0.1276	39.89	43.68	33.73	07-09	40-45
1	2	11.0	0.44	1.0	43.58	0.1299	0.94	11.98	11.98	4.786	...	90	1.00	0	291.1	0.1300	40.49	47.30	36.20	00-03	40-45
2	1	7.0	0.44	1.0	38.33	0.0000	0.75	7.33	7.33	10.000	...	240	0.03	0	315.7	0.1064	35.36	47.55	31.04	00-03	35-40
3	4	26.0	0.44	1.0	34.38	0.0000	0.73	5.28	5.28	10.000	...	310	0.00	0	291.1	0.0000	34.67	45.03	30.30	04-06	30-35
4	3	9.0	0.44	1.0	37.44	0.0000	0.70	9.14	9.14	10.000	...	303	0.44	0	347.7	0.0001	33.10	42.18	29.11	00-03	35-40
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
693066	23	13.0	1.00	1.0	37.05	0.0000	0.74	2.34	2.87	9.785	...	133	0.31	0	271.5	0.0000	31.42	44.76	27.77	22-24	35-40
693067	23	9.5	1.00	1.0	37.05	0.0000	0.74	2.34	2.87	9.785	...	133	0.31	0	271.5	0.0000	31.42	44.76	27.77	22-24	35-40
693068	23	13.5	1.00	1.0	37.05	0.0000	0.74	2.34	2.87	9.785	...	133	0.31	0	271.5	0.0000	31.42	44.76	27.77	22-24	35-40
693069	23	27.0	1.00	1.0	37.05	0.0000	0.74	2.34	2.87	9.785	...	133	0.31	0	271.5	0.0000	31.42	44.76	27.77	22-24	35-40
693070	23	10.0	1.00	1.0	37.05	0.0000	0.74	2.34	2.87	9.785	...	133	0.31	0	271.5	0.0000	31.42	44.76	27.77	22-24	35-40

	price	distance	surge_multiplier	cab_type	name
0	5.0	0.44	1.0	Lyft	Shared
1	11.0	0.44	1.0	Lyft	Lux
2	7.0	0.44	1.0	Lyft	Lyft
3	26.0	0.44	1.0	Lyft	Lux Black XL
4	9.0	0.44	1.0	Lyft	Lyft XL
...	...	...	...	...	...
693066	13.0	1.00	1.0	Uber	UberXL
693067	9.5	1.00	1.0	Uber	UberX
693068	13.5	1.00	1.0	Uber	Taxi
693069	27.0	1.00	1.0	Uber	Black SUV
693070	10.0	1.00	1.0	Uber	UberPool

FINAL PROJECT 1: LINEAR REGRESSION

A. Perkenalan

B. Import Package

C. Data Loading

Read Dataframe

General Info of Dataframe

Checking Duplicate Data

Filtering Uber and Lyft Data

D. Data Cleaning

E. Explorasi Data

Rata-rata dan Standar Deviasi Harga Uber vs Lyft

Rata-rata Harga Uber vs Lyft Per 3 Jam

Rata-rata Harga Uber vs Lyft Per 5 Derajat Temperatur

Proporsi Source dan Destination dari Uber vs Lyft

Distribusi dari Harga Uber vs Lyft

Overall Correlations

F. Data Preprocessing

Encode Data

Split Data

Scale Data

G. Pendefinisian dan Pelatihan Model

Create and Fit Model

H. Evaluasi Model

Get Results

Predict Response

I. Model Inference

Predict Testing Data

Comparison between Testing Data and Training Data

K. Pengambilan Kesimpulan

	price	distance	surge_multiplier	cab_type_Lyft	cab_type_Uber	name_Black	name_Black SUV	name_Lux	name_Lux Black	name_Lux Black XL	name_Lyft	name_Lyft XL	name_Shared	name_Taxi	name_UberPool	name_UberX	name_UberXL	name_WAV	Estimated Y
27838	22.5	2.04	1.0	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	21.98
571208	16.5	2.66	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	16.89
281888	10.5	3.06	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	11.14
431393	10.5	4.45	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	11.82
539045	7.0	1.20	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	6.23
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
84434	65.0	3.22	2.0	1.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	52.69
437782	18.5	3.06	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	17.91
620104	27.5	0.98	1.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	27.20
203245	10.0	1.45	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	6.87
100879	30.0	3.48	1.5	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	29.59

	price	distance	surge_multiplier	cab_type_Lyft	cab_type_Uber	name_Black	name_Black SUV	name_Lux	name_Lux Black	name_Lux Black XL	name_Lyft	name_Lyft XL	name_Shared	name_Taxi	name_UberPool	name_UberX	name_UberXL	name_WAV	Estimated Y
434738	14.0	2.35	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	16.10
458961	34.0	4.78	1.0	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	28.98
615197	9.5	2.86	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	11.49
533697	13.5	1.00	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	10.45
284874	3.0	0.71	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	2.26
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
47343	33.5	2.84	1.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	31.95
365089	13.5	1.04	1.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	14.15
199581	7.0	3.08	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	8.32
375742	19.5	3.19	1.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	19.64
402867	16.5	1.48	1.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	15.27

	Keterangan	Tingkat Akurasi
0	Training Data	0.917842
1	Testing Data	0.918874