Multidisciplinary field that involves extracting insights and knowledge from data through various processes, techniques, and tools
Umbrella term for a more comprehensive set of fields that are focused on mining big data sets and discovering innovative new insights, trends, methods, and processes
Essential step for setting a clear direction for the data science project
By thoroughly understanding the business context, defining specific goals, identifying key questions, and assessing feasibility, you lay a solid foundation for the subsequent stages of data collection, analysis, and modeling
A well-defined problem ensures that the project stays focused, resources are used efficiently, and the final outcomes are aligned with the business objectives
Detailed breakdown of the Problem Definition step:
Foundational step for the success of a data science project
By meticulously identifying data requirements, leveraging appropriate sources, ensuring data quality, and securely storing the data, you set the stage for accurate and reliable analysis
Detailed breakdown of the Data Collection step:
# IMPORT PYTHON PACKAGES
# Data Science and Analysis
import numpy as np # Numerical operations
import pandas as pd # Data manipulation and analysis
import scipy # Scientific computing
import matplotlib.pyplot as plt # Plotting and visualization
import seaborn as sns # Statistical data visualization
import plotly.express as px # Interactive data visualization
import statsmodels.api as sm # Statistical modeling
import dask.dataframe as dd # Parallel computing with Pandas-like DataFrames
import vaex # Out-of-core DataFrames for big data
# Machine Learning and AI
from sklearn import datasets, model_selection, preprocessing, metrics # Machine learning tools
from sklearn import ensemble, decomposition, compose # Machine learning tools
import tensorflow as tf # Deep learning framework
from tensorflow import keras # High-level neural networks API
import torch # Deep learning framework
import xgboost as xgb # Gradient boosting library
import lightgbm as lgb # Light Gradient Boosting Machine
import catboost # Categorical features gradient boosting library
# Natural Language Processing (NLP)
import nltk # Natural language processing toolkit
import spacy # Advanced natural language processing
import gensim # Topic modeling and document similarity
import transformers # State-of-the-art NLP models
from sklearn.feature_extraction.text import TfidfVectorizer # Text feature extraction
import textblob # Text processing and NLP
import nltk.sentiment.vader as vader # Sentiment analysis
# Data Visualization
from bokeh.plotting import figure, output_file, show # Interactive visualization
import altair as alt # Declarative statistical visualization
import folium # Interactive maps
import geopandas as gpd # Geospatial data processing
# Web Development
from django.http import HttpResponse # Django web framework
from flask import Flask, request # Flask web framework
import fastapi # FastAPI web framework
# General-Purpose Libraries
import requests # HTTP requests
from bs4 import BeautifulSoup # Web scraping
from PIL import Image # Image processing
import json # JSON data manipulation
import os # Operating system interfaces
import sys # System-specific parameters and functions
import re # Regular expressions
import datetime # Date and time manipulation
import logging # Logging facility for Python
import itertools # Functions creating iterators for efficient looping
import collections # Container datatypes
# Data Storage and Databases
import sqlite3 # SQLite database
import pymongo # MongoDB
import sqlalchemy # SQL toolkit and Object Relational Mapper
import psycopg2 # PostgreSQL database adapter
# Networking
import socket # Low-level networking interface
import paramiko # SSH protocol
import smtplib # Simple Mail Transfer Protocol
pandas.read_csv()
- function to read the csv file
header = None
- pass inside the read_csv() method so that pandas will not automatically set the first row as a header
dtype
- data types to apply to either the whole dataset or individual columns
coerce_float
- attempt to force numbers into floats
parse_dates
- list of columns to parse as dates
chunksize
- number of rows to include in each chunk
pd.read_excel()
- read excel file
pd.read_json()
- read json file
# READING CSV FILES:
pd.set_option('display.max_columns', None) # Show all available columns
path = "../1 datasets/iris_data.csv" # Set the path to your file
data_df = pd.read_csv(path) # Read the file (import the data)
data_df = pd.read_csv(path, sep="\t") # Different delimiters: tab separated file (.tsv)
data_df = pd.read_csv(path, delim_whitespace=True) # Different delimiters: space separated file
data_df = pd.read_csv(path, header=None) # Don't use first row for column names
data_df = pd.read_csv(path, names=["Name1", "Name2"]) # Specify column names
data_df = pd.read_csv(path, na_values=["NA", 99]) # Custom missing values
# READING SQL DATA:
import sqlite3 as sq3 # Import sqlite3
path = "database path" # Set the path to your database
con = sq3.connect(path) # Create a connection to the SQL database with sqlite3
query = '''SELECT * FROM table_name;'''
data_df = pd.read_sql(query, con) # Execute query
observations_generator = pd.read_sql(query, con,
                                     coerce_float=True,             # Attempt to force numbers into floats
                                     parse_dates=['Release_Year'],  # Parse `Release_Year` as a date
                                     chunksize=5)                   # Allows for streaming results as a series of shorter tables
# READING noSQL DATA:
from pymongo import MongoClient
con = MongoClient() # Create a Mongo connection
db = con.database_name # Choose database (con.list_database_names() will display available databases)
cursor = db.collection_name.find({}) # Create a cursor object using a query (pass a query document to filter; {} selects all)
data_df = pd.DataFrame(list(cursor)) # Expand cursor and construct DataFrame
dataframe.columns = headers
- to replace the headers with the list we created
# CREATE HEADERS:
headers = ["header_name1","header_name2"..."header_namen"]
data_df.columns = headers # Replace headers
# RENAME COLUMNS OR INDEXES USING A MAPPING:
data_df.rename(columns={"REF_DATE": "DATE", "Type of fuel": "TYPE"})
data_df.rename(index={0: "x", 1: "y", 2: "z"})
data_df.rename(str.lower, axis='columns') # Lowercase the columns using axis-style parameters
data_df.rename({1: 2, 2: 4}, axis='index') # Rename indexes using axis-style parameters
df.to_csv()
- save the dataset to csv
df.to_json()
- save the dataset to json
df.to_excel()
- save the dataset to excel
df.to_hdf()
- save the dataset to hdf
df.to_sql()
- save the dataset to sql
# SAVE DATAFRAME:
data_df.to_csv("data_df.csv", index=False) # index=False means the row names (index) will not be written
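A short sketch of the other save methods listed above, assuming the data_df DataFrame and, for to_sql, an existing connection such as the sqlite3 con created in the SQL-reading example; to_excel additionally requires an Excel engine such as openpyxl to be installed.
# SAVE TO OTHER FORMATS:
data_df.to_json("data_df.json") # Save the dataset to json
data_df.to_excel("data_df.xlsx", index=False) # Save the dataset to excel (needs an Excel engine such as openpyxl)
data_df.to_sql("table_name", con, if_exists="replace", index=False) # Save the dataset to a SQL table through an existing connection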
pandas.df.head()
- return the first n rows
pandas.df.shape
- shows how many entries there are in our dataset (index 0 for rows, index 1 for columns)
pandas.df.info()
- provides a concise summary of your DataFrame (prints information about a DataFrame including the index dtype and columns, non-null values and memory usage)
pandas.df.dtypes
- returns a Series with the data type of each column
pandas.df.value_counts()
- returns a Series containing counts of unique values (resulting object will be in descending order)
normalize
- if True the object returned will contain the relative frequencies of the unique values
sort
- sort by frequencies when True
ascending
- sort in ascending order (default False)
pandas.tolist()
- return a list of the values
pandas.df.describe()
- generates descriptive statistics of each numeric-typed (int, float) column, excluding NaN (Not a Number) values
include = "all"
- argument provides the statistical summary of all the columns, including object-typed attributes
max - min
- range of the values (computed as a 'range' row in the code below)
# SHOW FEW ROWS
data_df.head(5) # Show the first 5 rows of the dataframe
print(data_df.iloc[:5]) # Print a few rows
# SHOW NUMBER OF ROWS AND COLUMNS
print("There are: ", data_df.shape[0], " rows; ", data_df.shape[1], " columns")
# INFO
data_df.info()
# DATA TYPES OF THE COLUMNS
print(data_df.dtypes) # Data types; included in info()
data_df.dtypes.value_counts() # Counts columns according to data types
# UNIQUE VALUES
data_df['column'].value_counts() # Counts unique values
percentages = data_df['column'].value_counts(normalize=True) * 100 # Calculate percentages of value counts
# COLUMN NAMES
print(data_df.columns.tolist()) # Column names; included in info()
# DESCRIBE
data_df.describe()
data_df.describe(include = "all") # All the columns
data_df[['length','width','height']].describe() # For selected columns
data_df.describe(include=['object']) # To include type object
stats_df = data_df.describe() # Get statistical summary
stats_df.loc['range'] = stats_df.loc['max'] - stats_df.loc['min'] # Calculate range (max-min)
out_fields = ['mean','25%','50%','75%','range'] # Select just the rows desired from the 'describe' method
stats_df = stats_df.loc[out_fields]
stats_df.rename({'50%': 'median'}, inplace=True) # Add name median instead of 50%
An iterative and interactive process that requires careful attention to detail
Sets the foundation for all subsequent steps in the data science process, as the quality and structure of the data directly impact the accuracy and reliability of the insights derived from it
Effective data preparation ensures that the data is clean, relevant, and ready for analysis, ultimately leading to better decision-making and more robust models
Detailed breakdown of the Data Preparation step:
pandas.df.isnull()
- detect missing values (None or numpy.NaN); empty strings " " are not considered NA values
pandas.df.dropna()
- remove missing values
pandas.df.replace()
- replace values given in to_replace with value, e.g. .replace(A, B, inplace=True); replace by mean, frequency, or other
pandas.df.fillna()
- fill NA/NaN values using the specified method
pandas.df.astype()
- convert data types into a proper format for each column
# IDENTIFY AND HANDLE MISSING VALUES:
data_df.isnull().sum().sort_values(ascending=False) # Sort missing values in our dataset
missing_data = data_df.isnull() # False means there is no missing data in column
for column in missing_data.columns.values.tolist():
    print(column)
    print(missing_data[column].value_counts())
    print("")
data_df.dropna(inplace = True) # Delete NaN
data_df.dropna(how='all') # Drop the rows where all elements are missing
data_df.dropna(axis='columns') # Drop the columns where at least one element is missing
data_df.dropna(subset=["Lot Frontage"]) # Define in which columns to look for missing values
data_df.drop("Lot Frontage", axis=1) # Drop the whole attribute (column) if contains missing values
data_df.price = data_df.price.replace('?',np.nan) # Replace ? sign with NaN
data_df['brand'] = data_df['brand'].replace(['vw', 'vokswagen'], 'volkswagen') # Fixing typos in the names of the cars
median = data_df["Lot Frontage"].median()
data_df["Lot Frontage"].fillna(median, inplace = True) # Replace the missing values with the median value of that column
mean = data_df["Mas Vnr Area"].mean()
data_df["Mas Vnr Area"].fillna(mean, inplace = True) # Replace the missing values with the mean value of that column
data_df.price = data_df.price.astype('int64') # Convert into the type int
pandas.df.duplicated()
- check whether there are any duplicates in our data
pandas.df.drop_duplicates()
- removes all duplicate rows based on all the columns by default
pandas.Index.is_unique
- alternative way to check if there are any duplicated indexes in our dataset
# HANDLING DUPLICATES:
duplicate = data_df[data_df.duplicated(['PID'])] # To check whether there are any duplicates in the column
sum(data_df.duplicated(subset = 'car_ID')) == 0 # To find and sum duplicates on specific column
dup_removed = data_df.drop_duplicates() # To remove the duplicates
removed_sub = data_df.drop_duplicates(subset=['Order']) # Remove duplicates on a specific column
data_df.index.is_unique # To check if there are any duplicated Indexes in our dataset
data_df.brand.nunique() # Count number of distinct elements in specified axis
pandas.str.split()
- splits the string records
pat
- parameter can be used to split by other characters (if not specified, split on whitespace)
n
- limit number of splits in output (0 and -1 will be interpreted as return all splits)
expand=True
- returns a dataframe
# SPLITTING THE COLUMNS:
data_df[['City', 'Province']] = data_df['GEO'].str.split(',', n=1, expand=True) # Split GEO column to City and Province columns
data_df['brand'] = data_df.CarName.str.split(' ').str.get(0).str.lower() # Get all first words of car names in lowercase
pandas.to_datetime()
- transforms to datetime format
format='%b-%y'
- means that it will split into the name of a month and year
str.slice(stop=3)
- splits and outputs the first 3 letters of a month
# CHANGING TO DATETIME FORMAT:
data_df['DATE'] = pd.to_datetime(data_df['DATE'], format='%b-%y')
data_df['Month'] = data_df['DATE'].dt.month_name().str.slice(stop=3)
data_df['Year'] = data_df['DATE'].dt.year
# PARSING THE DURATION COLUMN (e.g. '2h 50m') AND EXTRACTING DATETIME FEATURES:
duration = list(data_df['Duration'])
for i in range(len(duration)):
    if len(duration[i].split()) != 2:
        if 'h' in duration[i]:
            duration[i] = duration[i].strip() + ' 0m'
        elif 'm' in duration[i]:
            duration[i] = '0h {}'.format(duration[i].strip())
dur_hours = []
dur_minutes = []
for i in range(len(duration)):
    dur_hours.append(int(duration[i].split()[0][:-1]))
    dur_minutes.append(int(duration[i].split()[1][:-1]))
data_df['Duration_hours'] = dur_hours
data_df['Duration_minutes'] = dur_minutes
data_df.loc[:, 'Duration_hours'] *= 60 # Convert hours to minutes
data_df['Duration_Total_mins'] = data_df['Duration_hours'] + data_df['Duration_minutes'] # Total duration in minutes
data_df["Dep_Hour"]= pd.to_datetime(data_df['Dep_Time']).dt.hour
data_df["Dep_Min"]= pd.to_datetime(data_df['Dep_Time']).dt.minute
data_df["Arrival_Hour"]= pd.to_datetime(data_df['Arrival_Time']).dt.hour
data_df["Arrival_Min"]= pd.to_datetime(data_df['Arrival_Time']).dt.minute
data_df['Month']= pd.to_datetime(data_df["Date_of_Journey"], format="%d/%m/%Y").dt.month
data_df['Day']= pd.to_datetime(data_df["Date_of_Journey"], format="%d/%m/%Y").dt.day
data_df['Year']= pd.to_datetime(data_df["Date_of_Journey"], format="%d/%m/%Y").dt.year
data_df['day_of_week'] = pd.to_datetime(data_df['Date_of_Journey']).dt.day_name()
In statistics, an outlier is an observation point that is distant from other observations (can be due to some mistakes in data collection or recording, or due to natural high variability of data points)
Outliers can markedly affect our models and can be a valuable source of information, providing us insights about specific behaviours
Z-score
- signed number of standard deviations by which the value of an observation or data point is above the mean value of what is being observed or measured
# Detecting outliers: Z-score
data_df['LQFSF_Stats'] = scipy.stats.zscore(data_df['Low Qual Fin SF']) # Compute the z-score of each value in the column
data_df[['Low Qual Fin SF','LQFSF_Stats']].describe().round(3) # Compare the raw values with their z-scores
# Detecting outliers: Statistics
# Calculate the interquartile range (Q1: 25th percentile and Q3: 75th percentile)
Q1, Q50, Q3 = np.percentile(data_df.SalePrice, [25, 50, 75])
IQR = Q3 - Q1
# Define lower and upper bounds
lower_bound = Q1 - 1.5*(IQR)
upper_bound = Q3 + 1.5*(IQR)
print("min:", lower_bound, "Q1:", Q1, "q50:", q50, "Q3:", Q3, "max:", upper_bound)
# Identify outliers
outliers = data_df[(data_df['SalePrice'] < lower_bound) | (data_df['SalePrice'] > upper_bound)]
print("Outliers:\n", outliers)
# Identify the points
[x for x in data_df["SalePrice"] if x > upper_bound]
# Remove outliers
df_cleaned = data_df[(data_df['SalePrice'] >= lower_bound) & (data_df['SalePrice'] <= upper_bound)]
print("Data after removing outliers:\n", df_cleaned)
# Detecting outliers: Visualization
sns.boxplot(x=data_df['Lot Area']) # Boxplot to spot extreme values
price_area = data_df.plot.scatter(x='Gr Liv Area', y='SalePrice') # Scatter plot to spot unusual points
data_df.sort_values(by = 'Gr Liv Area', ascending = False)[:2] # Sort values to find last 2 records (index: 1499,2181)
outliers_dropped = data_df.drop(data_df.index[[1499,2181]]) # If we want to delete some of the outliers
We can use logical operators on column values to filter rows. First, we specify the name of our data, then square brackets to select the name of the column, and a double equal sign, ==, to select the name of a row group, in single or double quotation marks. If we want to exclude some entries (e.g. some locations), we use the 'not equal' operator, !=. We can also use <, >, <=, >= to select numeric information, and | (or) and & (and) to combine conditions across multiple columns and rows
calgary = data_df[data_df['GEO']=='Calgary, Alberta'] # select the Calgary, Alberta data
sel_years = data_df[data_df['Year']==2000] # select 2000 year
mult_loc = data_df[(data_df['GEO']=="Toronto, Ontario") | (data_df['GEO']=="Edmonton, Alberta")] # Select Toronto and Edmonton locations
cities = ['Calgary', 'Toronto', 'Edmonton']
CTE = data_df[data_df.City.isin(cities)] # isin method to select multiple locations
mult_sel = data_df[(data_df['Year']==1990) & (data_df['TYPE']=="Household heating fuel") & (data_df['City']=='Vancouver')] # Select the data that shows the price of the 'household heating fuel', in Vancouver, in 1990
mult_sel = data_df[((data_df['Year']==1979) | (data_df['Year']==2021)) & (data_df['TYPE']=="Household heating fuel") & (data_df['City']=='Vancouver')] # Select the data that shows the price of the 'household heating fuel', in Vancouver, in the years 1979 and 2021
Involves examining and visualizing the dataset to uncover patterns, spot anomalies, test hypotheses, and check assumptions
Helps in understanding the underlying structure of the data, guiding the next steps in the analysis or modeling process
Allows us to get an initial feel for the data, lets us determine if the data makes sense (or if further cleaning or more data is needed), and helps to identify patterns and trends in the data
Detailed breakdown of the EDA step:
Correlation is a measure of the extent of interdependence between variables; causation is the cause-and-effect relationship between two variables
Correlation does not imply causation. Determining correlation is much simpler than determining causation, as causation may require independent experimentation
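As a quick, hedged illustration of quantifying correlation (not causation), scipy.stats.pearsonr returns both the Pearson coefficient and a p-value; the column names below are assumed from the housing examples used elsewhere in this section.
# CORRELATION COEFFICIENT AND P-VALUE:
from scipy.stats import pearsonr
clean = data_df[['Gr Liv Area', 'SalePrice']].dropna() # pearsonr does not accept missing values
pearson_coef, p_value = pearsonr(clean['Gr Liv Area'], clean['SalePrice']) # Strength and significance of the linear relationship
print("Pearson coefficient:", pearson_coef, "p-value:", p_value)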
pandas.df.corr()
- computes pairwise correlation of columns (Pearson by default), excluding NA/null values
corr_matrix = data_df.corr(numeric_only=True)
corr_matrix['SalePrice'].sort_values(ascending=False) # List of top features that have high correlation coefficient
hous_num = data_df.select_dtypes(include = ['float64', 'int64']) # Select only float and int data types
hous_num_corr = hous_num.corr()['SalePrice'][:-1] # [:-1] drops the last entry, SalePrice's correlation with itself
top_features = hous_num_corr[abs(hous_num_corr) > 0.5].sort_values(ascending=False) # Displays Pearson correlation coefficients greater than 0.5
print("There is {} strongly correlated values with SalePrice:\n{}".format(len(top_features), top_features))
for i in range(0, len(hous_num.columns), 5):
    sns.pairplot(data=hous_num, x_vars=hous_num.columns[i:i+5], y_vars=['SalePrice'])
sns.set_context('talk')
sns.pairplot(data_df, hue='species');
Statistical method used to test whether there are significant differences between the means of two or more groups. ANOVA returns two parameters: the F-test score (the ratio of the variation between the group means to the variation within the groups) and the p-value (the statistical significance of that result); see the sketch below
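A minimal one-way ANOVA sketch using scipy.stats.f_oneway, assuming the iris-style data_df with a categorical 'species' column and a numeric 'petal_length' column used in the plotting examples above.
# ANOVA: compare petal_length across species groups
from scipy.stats import f_oneway
groups = [grp['petal_length'].values for _, grp in data_df.groupby('species')] # One array of values per group
f_score, p_value = f_oneway(*groups) # F-test score and p-value
print("F-test score:", f_score, "p-value:", p_value)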
pandas.df.groupby
- groups data by different categories
pandas.df.pivot
- convert the dataframe to a pivot table
pandas.df.reset_index
- resets the index
# Group by single column
grouped_by_product = data_df.groupby('product')['sales'].sum()
# Group by one or more columns
group_year = data_df.groupby(['Year'])['VALUE'].mean() # Calculate the mean of the prices per year
group_month = data_df.groupby(['Month'])['VALUE'].max() # Group by the maximum value of prices, for each month
group_city = data_df.groupby(['Year', 'City'])['VALUE'].median().reset_index(name ='Value').round(2) # Group by the median value of prices, for each year and each city
# Using multiple aggregation functions
data_df.groupby('species').agg(['mean', 'median']) # Passing a list of recognized strings
data_df.groupby('species').agg([np.mean, np.median]) # Passing a list of explicit aggregation functions
grouped_with_agg = data_df.groupby('product')['sales'].agg(['sum', 'mean', 'count'])
# Using pivot
data_df.pivot(index='col1', columns='col2', values='col3')
# A simple scatter plot of sepal_length vs sepal_width
ax = plt.axes()
ax.scatter(data_df.sepal_length, data_df.sepal_width)
# Label the axes
ax.set(xlabel='Sepal Length (cm)', ylabel='Sepal Width (cm)', title='Sepal Length vs Width');
# A histogram of petal length
ax = plt.axes()
ax.hist(data_df.petal_length, bins=25);
ax.set(xlabel='Petal Length (cm)', ylabel='Frequency', title='Distribution of Petal Lengths');
ax = data_df.petal_length.plot.hist(bins=25) # Alternatively using Pandas plotting functionality
sns.set_context('notebook')
ax = data_df.plot.hist(bins=25, alpha=0.5) # Single plot with histograms for each feature overlayed
ax.set_xlabel('Size (cm)');
# To create four separate plots, use Pandas `.hist` method
axList = data_df.hist(bins=25)
# Add some x- and y- labels to first column and last row
for ax in axList.flatten():
    if ax.get_subplotspec().is_last_row():
        ax.set_xlabel('Size (cm)')
    if ax.get_subplotspec().is_first_col():
        ax.set_ylabel('Frequency')
data_df.boxplot(by='species') # Boxplot of each petal and sepal measurement
# Single boxplot where the features are separated in the x-axis and species are colored with different hues
plot_data = (data_df
.set_index('species')
.stack()
.to_frame()
.reset_index()
.rename(columns={0:'size', 'level_1':'measurement'}))
sns.set_style('white')
sns.set_context('notebook')
sns.set_palette('dark')
f = plt.figure(figsize=(6,4))
sns.boxplot(x='measurement', y='size', hue='species', data=plot_data);
Vital step in the data science process that significantly impacts the performance and accuracy of machine learning models
By creating new features, transforming existing ones, selecting the most relevant features, and applying domain knowledge, data scientists can enhance the predictive power of their models
Detailed breakdown of the Feature Engineering step:
numpy.log()
- function that performs a log transform
# Log transformation
log_transformed = np.log(data_df['SalePrice'])
# Aggregation
data_df['total_purchases'] = data_df.groupby('customer_id')['purchase_amount'].transform('sum')
# Date and time features
data_df['purchase_date'] = pd.to_datetime(data_df['purchase_date'])
data_df['day_of_week'] = data_df['purchase_date'].dt.dayofweek
data_df['month'] = data_df['purchase_date'].dt.month
MinMaxScaler
- from scikit-learn, transforms features by scaling each feature to a given range
StandardScaler
- from scikit-learn, standardizes features by removing the mean and scaling to unit variance
pandas.get_dummies
- convert categorical variable into dummy/indicator variables
pandas.cut
- bin values into discrete intervals
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
# Normalization
scaler = MinMaxScaler()
data_df['normalized_feature'] = scaler.fit_transform(data_df[['numerical_feature']])
# X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
# X_scaled = X_std * (max - min) + min, where (min, max) is the target feature_range
# Standardization
scaler = StandardScaler()
data_df['standardized_feature'] = scaler.fit_transform(data_df[['numerical_feature']])
# z = (x - u) / s
# One-hot encoding
data_df = pd.get_dummies(data=data_df, columns = ['Airline', 'Source', 'Destination'])
# Label encoding
label_encoder = LabelEncoder()
data_df['encoded_feature'] = label_encoder.fit_transform(data_df['categorical_feature'])
# Binning
data_df["Arrival_Hour"]= pd.to_datetime(data_df['Arrival_Time']).dt.hour
data_df['arr_timezone'] = pd.cut(data_df.Arrival_Hour, [0,6,12,18,24], labels=['Night','Morning','Afternoon','Evening'])
data_df['binned_feature'] = pd.cut(data_df['numerical_feature'], bins=5, labels=False)
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
# Filter method
selector = SelectKBest(chi2, k=10)
selected_features = selector.fit_transform(data_df.drop(columns=['target']), data_df['target'])
# Wrapper method
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=10)
selected_features = rfe.fit_transform(data_df.drop(columns=['target']), data_df['target'])
Where the core of data science work happens, involves selecting the right algorithms, training models, tuning hyperparameters, and evaluating performance to ensure the model meets the defined objectives
By carefully executing each sub-step and iterating based on feedback, data scientists can develop robust models that provide valuable insights and predictions
Detailed breakdown of the Modeling step:
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
# Classification model
model = LogisticRegression()
# Clustering model
kmeans = KMeans(n_clusters=3)
from sklearn.model_selection import train_test_split
# Split the data
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], test_size=0.2, random_state=42)
# Train the model
model.fit(X_train, y_train)
from sklearn.model_selection import GridSearchCV
# Define parameter grid
param_grid = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']}
# Grid search
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Best parameters
print(grid_search.best_params_)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Predict on test set
y_pred = model.predict(X_test)
# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
# Classification report
print(classification_report(y_test, y_pred))
# Confusion matrix
print(confusion_matrix(y_test, y_pred))
import shap
# SHAP values
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, X_test)
from sklearn.model_selection import cross_val_score
# Cross-validation
scores = cross_val_score(model, df.drop(columns=['target']), df['target'], cv=5)
print(f'Cross-Validation Scores: {scores}')
print(f'Mean Score: {scores.mean()}')
Essential for validating the performance and effectiveness of your machine learning models, proper evaluation helps in making informed decisions and improving model reliability and robustness
By using appropriate metrics, performing cross-validation, comparing different models, analyzing errors, and interpreting results, you ensure that the model not only performs well on training data but also generalizes effectively to new, unseen data
Detailed breakdown of the Evaluation step:
# CLASSIFICATION
# Model predictions
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1] # Probabilities for ROC
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
# Precision, Recall, F1 Score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC: {roc_auc}')
# REGRESSION
# Model predictions
y_pred = model.predict(X_test)
# MAE, MSE, RMSE
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# R-squared
r2 = r2_score(y_test, y_pred)
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')
# Cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') # for classification
print(f'Cross-Validation Scores: {scores}')
print(f'Mean Score: {scores.mean()}')
# Compare multiple models
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
model1 = LogisticRegression()
model2 = RandomForestClassifier()
scores1 = cross_val_score(model1, X, y, cv=5)
scores2 = cross_val_score(model2, X, y, cv=5)
print(f'Model 1 Scores: {scores1.mean()}')
print(f'Model 2 Scores: {scores2.mean()}')
# Confusion matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')
# SHAP values
import shap
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, X_test)
Ensures that the machine learning model is not only functional but also reliable and secure in a real-world environment
By carefully managing serialization, integration, monitoring, updating, documentation, and security, you can successfully deploy a model that delivers valuable insights and predictions to end-users
Detailed breakdown of the Deployment step:
import pickle
import joblib
# Using pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
# Using joblib
joblib.dump(model, 'model.joblib')
# Loading the model
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
# Script to load model and process data
def process_batch(data_batch):
    model = joblib.load('model.joblib')
    predictions = model.predict(data_batch)
    return predictions
# Example of a cron job to run the script daily
# 0 0 * * * /usr/bin/python /path/to/your/script.py
from flask import Flask, request, jsonify
import joblib
app = Flask(__name__)
model = joblib.load('model.joblib')
@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json(force=True)
    prediction = model.predict([data['features']])
    return jsonify({'prediction': prediction.tolist()})
if __name__ == '__main__':
    app.run(debug=True)
from scipy.stats import ks_2samp
# Function to detect data drift
def detect_drift(new_data, reference_data):
    p_value = ks_2samp(new_data, reference_data).pvalue
    return p_value < 0.05 # If p-value is less than 0.05, data drift is detected
# Monitor data drift
is_drifted = detect_drift(new_data['feature'], reference_data['feature'])
print(f'Data Drift Detected: {is_drifted}')
def retrain_model(new_data, new_labels):
    # Load existing model
    model = joblib.load('model.joblib')
    # Retrain model with new data
    model.fit(new_data, new_labels)
    # Save the retrained model
    joblib.dump(model, 'model.joblib')
# Example of retraining the model
new_data = ... # Load new data
new_labels = ... # Load new labels
retrain_model(new_data, new_labels)
from flask import Flask, request, jsonify
import joblib
import jwt # For authentication tokens
app = Flask(__name__)
model = joblib.load('model.joblib')
def authenticate(token):
    # Implement token authentication logic
    try:
        payload = jwt.decode(token, 'your-secret-key', algorithms=['HS256'])
        return payload['user'] == 'authorized_user'
    except jwt.ExpiredSignatureError:
        return False
    except jwt.InvalidTokenError:
        return False
@app.route('/predict', methods=['POST'])
def predict():
    token = request.headers.get('Authorization')
    if not authenticate(token):
        return jsonify({'error': 'Unauthorized'}), 401
    data = request.get_json(force=True)
    prediction = model.predict([data['features']])
    return jsonify({'prediction': prediction.tolist()})
if __name__ == '__main__':
    app.run(debug=True)
Ensures that the results of the data science process are effectively conveyed to stakeholders in a clear, concise, and actionable manner
By tailoring the message to the audience, using appropriate formats and visualizations, and gathering feedback, data scientists can ensure that their work has a meaningful impact on decision-making and business outcomes
Detailed breakdown of the Communication step:
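One minimal, illustrative sketch of the visualization side of this step, assuming the evaluation metrics (accuracy, precision, recall, f1) computed in the Evaluation step above; the chart type and file name are assumptions, not a prescribed format.
# SHAREABLE PERFORMANCE SUMMARY FOR STAKEHOLDERS:
metrics = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1': f1}
plt.figure(figsize=(6, 4))
plt.bar(list(metrics.keys()), list(metrics.values())) # Simple bar chart of the headline metrics
plt.title('Model performance summary')
plt.savefig('model_performance.png') # Figure that can be dropped into reports or slides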
Ensures that the machine learning model remains effective and relevant over time
By continuously monitoring performance, retraining as necessary, versioning models, ensuring compliance, and keeping stakeholders informed, you can maintain the reliability and accuracy of the model in a dynamic environment
Proper maintenance helps in sustaining the value provided by the model and adapting to changes in data and business needs
Detailed breakdown of the Maintenance step:
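A minimal maintenance sketch, assuming the joblib-serialized model from the Deployment step; the performance threshold, file names, and the idea of re-checking on recent labelled data are illustrative assumptions rather than a fixed recipe.
# MONITOR, RETRAIN, AND VERSION THE MODEL:
import datetime
import joblib
from sklearn.metrics import accuracy_score
def maintain_model(model_path, X_recent, y_recent, threshold=0.8):
    model = joblib.load(model_path) # Load the currently deployed model
    score = accuracy_score(y_recent, model.predict(X_recent)) # Monitor performance on recent labelled data
    if score < threshold: # Retrain only when performance degrades (threshold is an assumption)
        model.fit(X_recent, y_recent)
        versioned_path = f"model_{datetime.date.today().isoformat()}.joblib"
        joblib.dump(model, versioned_path) # Keep a versioned copy for rollback and auditing
        joblib.dump(model, model_path) # Update the deployed model
    return score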