So, thanks to people's help, I fixed one problem in my code, but now I'm running into a KeyError on a date that exists in the data, and that has me stumped.
This is the line the error comes from:
feature_matrix.loc["2019-06-04"]
It is near the end of the code, under the comment #Predicting.
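If it helps, here is a tiny standalone example of what I suspect might be going on (just a guess on my part: since the feature matrix is built from df.groupby("date"), its index might hold plain datetime.date objects instead of a DatetimeIndex, in which case string lookups fail):

import datetime
import pandas as pd

# Hypothetical mini-frame whose index holds datetime.date objects,
# similar to what df.groupby("date").first() likely produces below
mini = pd.DataFrame(
    {"x": [1, 2]},
    index=[datetime.date(2019, 6, 4), datetime.date(2019, 6, 5)],
)
print(mini.loc[datetime.date(2019, 6, 4)])  # works
print(mini.loc["2019-06-04"])               # raises KeyError on an object index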
P.S. Since I'm using Google Colab, which auto-prints the last expression of each cell, some of the formatting is a bit odd.
The full code is below:
data_urls = {
"2014": "https://sitewebbixi.s3.amazonaws.com/uploads/docs/biximontrealrentals2014-f040e0.zip",
"2015": "https://sitewebbixi.s3.amazonaws.com/uploads/docs/biximontrealrentals2015-69fdf0.zip",
"2016": "https://sitewebbixi.s3.amazonaws.com/uploads/docs/biximontrealrentals2016-912f00.zip",
"2017": "https://sitewebbixi.s3.amazonaws.com/uploads/docs/biximontrealrentals2017-d4d086.zip",
"2018": "https://sitewebbixi.s3.amazonaws.com/uploads/docs/biximontrealrentals2018-96034e.zip",
"2019": "https://sitewebbixi.s3.amazonaws.com/uploads/docs/biximontrealrentals2019-33ea73.zip",
}
"""# Load the data"""
"""
Load all the data
We will need:
- the requests package to surf the web,
- the io package to read the data stream from the response,
- the zipfile package to manipulate the archive.
"""
import io
import pandas as pd
import requests
import zipfile
df = None
for year, url in data_urls.items():
    print("Processing {}".format(year))
    # Load the url
    response = requests.get(url)
    # Read the archive from the response
    archive = zipfile.ZipFile(io.BytesIO(response.content))
    # Loop over all the files in the archive
    for file in archive.namelist():
        # Check that we are looking at one of the files we want
        if not archive.getinfo(file).is_dir() and "Station" not in file:
            print("Loading data from: {}".format(file))
            # We will load the start_date column only to save on memory use
            try:
                current_length = len(df)
                df = df.append(
                    pd.read_csv(archive.open(file), usecols=["start_date"]),
                    ignore_index=True,
                )
            except TypeError:
                # df is still None on the very first file
                current_length = 0
                df = pd.read_csv(archive.open(file), usecols=["start_date"])
            print(" > {} rows processed".format(len(df) - current_length))
    response.close()
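"""Side note: the try/except around df.append above is a bit clunky; a sketch of an equivalent loader using pd.concat (reusing the imports above, and assuming every CSV has the same start_date column) would be:"""
frames = []
for year, url in data_urls.items():
    response = requests.get(url)
    archive = zipfile.ZipFile(io.BytesIO(response.content))
    for file in archive.namelist():
        if not archive.getinfo(file).is_dir() and "Station" not in file:
            frames.append(pd.read_csv(archive.open(file), usecols=["start_date"]))
    response.close()
df = pd.concat(frames, ignore_index=True)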
"""# Convert to datetime"""
#This converts the start_date column to datetime
df["start_date"] = pd.to_datetime(df["start_date"])
df.describe()
df.info()
"""#Adding values that will help with the project"""
#This adds a column with just the date (no time component)
df["date"] = df["start_date"].dt.date
#This adds values for each year from 2014 to 2019 to the dataframe
df["year"] = df["start_date"].dt.year
#This adds values for each day of the week to the dataframe
df["dayoftheweek"] = df["start_date"].dt.dayofweek
#This adds values for each day of the year to the dataframe
df["dayoftheyear"] = df["start_date"].dt.dayofyear
#This adds values for each month to the dataframe
df["month"] = df["start_date"].dt.month
#This adds values for each week of the year to the dataframe
df["week"] = df["start_date"].dt.week
#This adds values for each hour (in a day) to the dataframe
df["houroftheday"] = df["start_date"].dt.hour
"""# Visualizations to see the trends"""
#Get simple dataframes for each type of data in order to graph them
year_df = df["year"].value_counts(normalize=True).sort_index()
dayoftheweek_df = df["dayoftheweek"].value_counts(normalize=True).sort_index()
dayoftheyear_df = df["dayoftheyear"].value_counts(normalize=True).sort_index()
week_df = df["week"].value_counts(normalize=True).sort_index()
month_df = df["month"].value_counts(normalize=True).sort_index()
houroftheday_df = df["houroftheday"].value_counts(normalize=True).sort_index()
date_df = df["date"].value_counts(normalize=True).sort_index()
"""##Trends for each **year**"""
import matplotlib.pyplot as plt
plt.figure(figsize=[16,9], dpi=300)
plt.bar(year_df.index, year_df.values)
plt.show()
"""Usage goes higher each year, so that needs to be taken into account.
##Trends for each **day of the week**
"""
plt.figure(figsize=[16,9], dpi=300)
plt.bar(dayoftheweek_df.index, dayoftheweek_df.values)
plt.show()
"""In the graph above, 0 stands for Monday, 1 for Tuesday, so on and so forth.<br> So, BIXI usage is smaller during Saturday and Sunday. That seems to be a factor.
##Trends for each **month**
"""
plt.figure(figsize=[16,9], dpi=300)
plt.bar(month_df.index, month_df.values)
plt.show()
"""The trend is lower in the months where BIXI ends/starts and when it gets colder. Usage peaks in summer.
##Trends for each **day of the year**
"""
plt.figure(figsize=[16,9], dpi=300)
plt.bar(dayoftheyear_df.index, dayoftheyear_df.values)
plt.show()
"""The trend is similar to the months except, that on certain weekends (it's every seven days) we have a few drops.
## Trends per date
"""
plt.figure(figsize=[16,9], dpi=300)
plt.bar(date_df.index, date_df.values)
plt.show()
"""This is essentially a combination of the trend per year and the trend per day of the year together, but it uses the actual dates.
##Trends for each **week**
"""
plt.figure(figsize=[16,9], dpi=300)
plt.bar(week_df.index, week_df.values)
plt.show()
"""This trend is pretty much the monthly trend but with weeks (we see how the changes in usage happen in more detail)
## Trends for each **hour**
"""
plt.figure(figsize=[16,9], dpi=300)
plt.bar(houroftheday_df.index, houroftheday_df.values)
plt.show()
"""As we can see here, there's more bixi usage around rush hour periods (8 am & 3-6 pm), so we need to take that into account, since it is an important factor.
#Building the model
To build the model, we first need to set the target vector, then create the feature matrix, and finally initialize the model that will make our predictions.
##Target vector
"""
target_df = df.groupby("date").size()
target_df
"""## Feature matrix
Here we will create the feature matrix
"""
#This needs to be done in order to be able to get the month name for the feature matrix
date_df["date"] = pd.to_datetime(df["date"])
date_df.describe()
"""###Feature #1 : Day of the week"""
feature_df = pd.get_dummies(
    df.groupby("date").first(), columns=["dayoftheweek"], prefix="", prefix_sep=""
).loc[:, ["0", "1", "2", "3", "4", "5", "6"]]
feature_df
"""###Feature #2 : Month of the year"""
for i, month in enumerate(
    pd.Series(feature_df.index).map(pd.Timestamp).dt.month_name().unique(), 3
):
    x = (pd.Series(feature_df.index).map(pd.Timestamp).dt.month == (i + 1)).astype(int)
    x.index = feature_df.index
    feature_df[month] = x
feature_df
"""##Dropping redundant columns"""
#This removes Monday ("0") and April; they serve as our baseline (one level of each categorical must be dropped to avoid perfect collinearity with the intercept)
feature_matrix = feature_df.drop(columns=["0", "April"])
"""## Choosing a model
The model will be initialized and then used to make some predictions
"""
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model
model.fit(feature_matrix, target_df)
model.coef_
model.intercept_
parameters = pd.Series(model.coef_, index=feature_matrix.columns)
parameters
"""#Predicting"""
#June 4th 2019 was a Tuesday
feature_matrix.loc["2019-06-04"]
feature_matrix.loc["2019-06-04"].values.reshape(1, -1)
model.predict(feature_matrix.loc["2019-06-04"].values.reshape(1, -1))[0]
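"""In case it helps diagnose the error, a quick check of what the index actually contains (diagnostic only, not part of the pipeline):"""
print(type(feature_matrix.index))
print(feature_matrix.index[:5])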
"""##Quick plot for the initial features"""
plt.figure(figsize=[21,9], dpi=300)
plt.plot(feature_matrix.index, target_df, feature_matrix.index, model.predict(feature_matrix))
plt.show()
# 2016 only, since it seems to be the most accurate
from matplotlib import dates as mdate
plt.figure(figsize=[21,9], dpi=300)
plt.plot(feature_matrix.index, target_df, feature_matrix.index, model.predict(feature_matrix))
plt.xlim(left=mdate.datestr2num("2016-04-15"), right=mdate.datestr2num("2016-12-31"))
plt.show()
# 2019 only, since it seems to be the least accurate
from matplotlib import dates as mdate
plt.figure(figsize=[21,9], dpi=300)
plt.plot(feature_matrix.index, target_df, feature_matrix.index, model.predict(feature_matrix))
plt.xlim(left=mdate.datestr2num("2019-04-15"), right=mdate.datestr2num("2019-10-31"))
plt.show()
"""## Initial RMSE"""
import numpy as np
from sklearn.utils import resample
np.random.seed(1)
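# Note: each iteration below re-fits `model` on a bootstrap resample, so after this
# line `model` is left fitted on the last resample rather than on the full data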
uncertainty = np.std([model.fit(*resample(feature_matrix, target_df)).coef_ for i in range(1000)], 0)
params = pd.Series(uncertainty, index=feature_matrix.columns)
params
"""###Check RMSE for 2016 & 2019
####2019
"""
import math
from sklearn.metrics import mean_squared_error
#Use all data and then check 2019 predictions
error = mean_squared_error(model.predict(feature_matrix.loc[feature_matrix.index.year == 2019]), target_df.loc[target_df.index.year == 2019])
print(math.sqrt(error))
# Retrain while leaving out 2019, and then try to predict 2019
model.fit(feature_matrix.loc[feature_matrix.index.year < 2019], target_df.loc[target_df.index.year < 2019])
error = mean_squared_error(model.predict(feature_matrix.loc[feature_matrix.index.year == 2019]), target_df.loc[target_df.index.year == 2019])
print(math.sqrt(error))
"""####2016"""
import math
from sklearn.metrics import mean_squared_error
#Use all data and then check 2016 predictions
error = mean_squared_error(model.predict(feature_matrix.loc[feature_matrix.index.year == 2016]), target_df.loc[target_df.index.year == 2016])
print(math.sqrt(error))
# Retrain while leaving out 2016, and then try to predict 2016
model.fit(feature_matrix.loc[feature_matrix.index.year < 2016], target_df.loc[target_df.index.year < 2016])
error = mean_squared_error(model.predict(feature_matrix.loc[feature_matrix.index.year == 2016]), target_df.loc[target_df.index.year == 2016])
print(math.sqrt(error))