
Time Series Forecasting with XGBoost


Forecasting solar energy production with time series and the XGBoost model

liz west, CC BY 2.0, via Wikimedia Commons
TELUS Spark Science Centre in Calgary, Canada.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

df = pd.read_csv('./../data/Solar_Energy_Production.csv')

site_install = df.groupby('name')['installationDate'].unique()
site_install = pd.DataFrame(site_install).sort_values(by='installationDate')

site_install

counts = df.groupby('name')['kWh'].sum()
site_totals = pd.DataFrame(counts).sort_values(by='kWh', ascending=True)

site_totals.plot(figsize=(15, 6), kind='barh', legend=False)
plt.title('Calgary Solar Power Generation per Site [September 2015 to March 2023]')
plt.xlabel('GigaWatt-hours')
plt.ylabel('Site')

plt.show()

# convert to datetime
df['date'] = pd.to_datetime(df['date'])
df_pw = df.drop(columns= ['name', 'id', 'address', 'public_url', 'installationDate', 'uid'])
df_pw = df_pw.set_index('date')

# create daily totals
count_date = df_pw.groupby(df_pw.index.date)['kWh'].sum()
pw_clean = pd.DataFrame(count_date)
pw_clean['date'] = pd.to_datetime(pw_clean.index)
pw_clean = pw_clean.set_index('date')

pw_clean.plot(style='-', figsize=(20, 7), lw=1,
              title='Calgary Solar Power Production in kWh')
plt.show()
We can see a marked seasonal variation, with highs in the summer months and lows in winter.
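As a quick optional check of that seasonality (a minimal sketch, not part of the original workflow, assuming the pw_clean dataframe from the previous step), the daily totals can be grouped by calendar month and drawn as boxplots:

# rough seasonality check: distribution of daily kWh by calendar month
monthly = pw_clean.copy()
monthly['month'] = monthly.index.month

fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=monthly, x='month', y='kWh', ax=ax)
ax.set_title('Daily Solar Production by Month')
plt.show()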
def create_attributes(df):
    df = df.copy()
    df['day'] = df.index.day
    df['dayofweek'] = df.index.dayofweek
    df['month'] = df.index.month
    df['quarter'] = df.index.quarter
    df['year'] = df.index.year
    df['dayofyear'] = df.index.dayofyear
    return df

pw_clean = create_attributes(pw_clean)

DataFrame with attributes
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

# split into 4 folds
ts_cv = TimeSeriesSplit(n_splits=4)

fold = 0
preds = []
scores = []

for train_idx, val_idx in ts_cv.split(pw_clean):
    train = pw_clean.iloc[train_idx]
    test = pw_clean.iloc[val_idx]

    # add attributes to the training and test sets
    train = create_attributes(train)
    test = create_attributes(test)

    features = ['day', 'dayofweek', 'month', 'quarter', 'year', 'dayofyear']
    target = ['kWh']

    # split features and target variable
    X_train = train[features]
    y_train = train[target]

    X_test = test[features]
    y_test = test[target]

    # create regressor instance
    xgb_reg = xgb.XGBRegressor(booster='gbtree',
                               seed=42,
                               n_estimators=1000,
                               early_stopping_rounds=50,
                               objective='reg:squarederror',
                               reg_lambda=0.001,
                               max_depth=5,
                               eta=0.01)
    # train the model
    xgb_reg.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_test, y_test)],
                verbose=100)

    # predictions and evaluation
    y_pred = xgb_reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

print('Fold scores:', scores)
print('Avg. Score:', np.mean(scores))
Fold scores: [2598.514863235386, 2435.9595990180537, 2117.653168183066, 1949.2793308876999]
Avg. Score: 2275.3517403310516
fig, axs = plt.subplots(4, 1, figsize=(15, 15), sharex=True)

fold = 0
for train_idx, val_idx in ts_cv.split(pw_clean):
    train = pw_clean.iloc[train_idx]
    test = pw_clean.iloc[val_idx]
    train['kWh'].plot(ax=axs[fold], lw=1,
                      label='Training Set',
                      title=f'Train/Test Split Fold {fold}')
    test['kWh'].plot(ax=axs[fold], lw=1,
                     label='Test Set')
    axs[fold].axvline(test.index.min(), color='black', ls='--')
    fold += 1
plt.show()

# create the full dataframe
pw_clean = create_attributes(pw_clean)

features = ['day', 'dayofweek', 'month', 'quarter', 'year', 'dayofyear']
target = ['kWh']

X_full = pw_clean[features]
y_full = pw_clean[target]

# create regressor instance
xgb_regf = xgb.XGBRegressor(booster='gbtree',
                            seed=42,
                            n_estimators=1000,
                            early_stopping_rounds=50,
                            objective='reg:squarederror',
                            reg_lambda=0.001,
                            max_depth=5,
                            eta=0.01)
# train the model
xgb_regf.fit(X_full, y_full,
             eval_set=[(X_full, y_full)],
             verbose=100)
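Before producing the forecast, an optional diagnostic (not part of the original post) is to check which calendar attributes the fitted booster leans on most, using XGBoost's built-in importance plot; a minimal sketch:

# gain-based feature importance of the model trained on the full series
xgb.plot_importance(xgb_regf, importance_type='gain', height=0.5)
plt.title('Feature Importance (gain)')
plt.tight_layout()
plt.show()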
# range from March to September 2023
pred_dates = pd.date_range('2023-03-16', '2023-09-17', freq='D')
preds_df = pd.DataFrame(index=pred_dates)

# create a flag column
preds_df['Future'] = True
pw_clean['Future'] = False

# concatenate dataframes
pred_pw = pd.concat([pw_clean.loc[pw_clean.index >= '01-01-2018'], preds_df])

# add attributes
pred_pw = pred_pw.copy()
pred_pw = create_attributes(pred_pw)

# select the future dates
future_pred_pw = pred_pw.query('Future').copy()
future_pred_pw.head()

"Future" DataFrame
# model predictions
future_pred_pw['prediction'] = xgb_regf.predict(future_pred_pw[features])

# plot
ax = (pw_clean['kWh'].loc[pw_clean.index >= '01-01-2018']
      .plot(figsize=(20, 6), lw=1.5, title='Calgary Solar Power Production in kWh'))
future_pred_pw['prediction'].plot(style='-', lw=1.5)
ax.axvline('2023-03-16', color='black', ls='--', lw=1.5)
plt.legend(['Historic Data', 'Predictions'], fontsize=14)
plt.show()

Forecast for the next 6 months.