
Linear Regression¶

Objective¶

The goal of this notebook is to predict a continuous numerical value using Linear Regression.

This notebook follows the standard Machine Learning pipeline:

  • Data loading
  • Preprocessing
  • Model training
  • Prediction
  • Evaluation

Problem type: Regression

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

Data Loading¶

In [2]:
df = pd.read_csv("../data/regression.csv")
df.head()
Out[2]:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5      target
0  -0.753965   0.281191  -0.062593  -0.280675   0.758929   0.104201   15.914852
1   1.031845  -0.439731   0.196555  -1.485560  -0.186872   1.446978  -24.363081
2  -0.600639   0.110923   0.375698  -0.291694  -0.544383  -1.150994  -55.864380
3   0.998311  -0.322320   1.521316  -0.431620   1.615376   1.217159  308.187994
4   0.338496   0.770865   1.143754  -0.415288   0.235615  -1.478586  165.850761

Dataset¶

The dataset contains:

  • numerical input features
  • a continuous target variable

This is a regression problem, therefore:

  • there are no classes
  • no confusion matrix is used

Train/Test Split¶

We first separate the features (X) from the target (y), then hold out 20% of the rows for evaluation.

In [3]:
X = df.drop("target", axis=1)
y = df["target"]
In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape
Out[4]:
((400, 6), (100, 6))

Model Training¶

We train a Linear Regression model using the training data.

In [5]:
model = LinearRegression()
model.fit(X_train, y_train)
Out[5]:
LinearRegression()

Prediction¶

The trained model is used to make predictions on unseen test data.

In [6]:
y_pred = model.predict(X_test)
y_pred[:5]
Out[6]:
array([ 164.05060775, -168.51088955,  -66.93683516,  -35.28712732,
       -261.425874  ])
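
A quick side-by-side comparison of predictions against the held-out targets can reveal systematic errors before computing summary metrics. The sketch below uses a few hypothetical values (the first predictions shown above, paired with made-up actuals) so it runs standalone; in the notebook you would pass `y_test` and `y_pred` directly.

```python
import numpy as np
import pandas as pd

# Hypothetical actual values paired with the first predictions from above
y_test = pd.Series([164.8, -170.2, -65.0])
y_pred = np.array([164.05, -168.51, -66.94])

# Side-by-side view with per-row residuals (actual minus predicted)
comparison = pd.DataFrame({"actual": y_test.values, "predicted": y_pred})
comparison["residual"] = comparison["actual"] - comparison["predicted"]
print(comparison)
```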

Model Evaluation¶

For regression tasks, common evaluation metrics are:

  • Mean Squared Error (MSE)
  • R² Score

Classification metrics such as accuracy or F1-score are not applicable.

In [7]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
Mean Squared Error (MSE): 287.58605355048087
R² Score: 0.9888363641525172
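
To make the metrics concrete, the sketch below recomputes MSE and R² by hand on a small hypothetical example, following the same definitions `mean_squared_error` and `r2_score` implement: MSE is the average squared residual, and R² is one minus the ratio of the residual sum of squares to the total sum of squares.

```python
import numpy as np

# Hypothetical small example for manual MSE and R² computation
y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_hat = np.array([2.5, 0.0, 2.0, 8.0])

mse = np.mean((y_true - y_hat) ** 2)            # average squared residual
ss_res = np.sum((y_true - y_hat) ** 2)          # residual sum of squares
ss_tot = np.sum((y_true - y_true.mean()) ** 2)  # total sum of squares
r2 = 1 - ss_res / ss_tot

print("MSE:", mse)  # 0.375
print("R²:", r2)
```

An R² of 1.0 means the model explains all variance in the target; 0.0 means it does no better than predicting the mean.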

Model Interpretation¶

Each coefficient gives the change in the predicted target for a one-unit increase in the corresponding feature, holding the other features fixed.

In [8]:
coefficients = pd.Series(model.coef_, index=X.columns)
coefficients
Out[8]:
feature_0    43.251465
feature_1    98.112448
feature_2    96.219087
feature_3    35.651902
feature_4    81.433923
feature_5    26.081725
dtype: float64
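
When comparing feature influence, it can help to rank the coefficients by absolute magnitude and to inspect the intercept as well. A sketch, using `make_regression` as a synthetic stand-in since the notebook's CSV isn't bundled here:

```python
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# Synthetic stand-in for the notebook's dataset
X, y = make_regression(n_samples=500, n_features=6, noise=10.0, random_state=42)
X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(6)])

model = LinearRegression().fit(X, y)

# Rank features by the absolute size of their coefficient
coefficients = pd.Series(model.coef_, index=X.columns)
ranked = coefficients.abs().sort_values(ascending=False)
print(ranked)
print("intercept:", model.intercept_)
```

Note that magnitude comparisons across features are only meaningful when the features are on comparable scales; otherwise, standardize X first.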