5 min read
Feature Engineering Basics
Learn to create better features for your models
What is Feature Engineering?
Creating new features from existing data to improve model performance.
Good features = Better predictions!
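As a tiny sketch of the idea (the data and column names here are made up for illustration), two raw columns can be combined into one feature that carries the signal directly:

```python
import pandas as pd

# Hypothetical data: weight and height alone hide the useful signal
df = pd.DataFrame({'weight_kg': [70, 90], 'height_m': [1.75, 1.80]})

# Engineered feature: BMI combines both raw columns into one predictor
df['bmi'] = df['weight_kg'] / df['height_m'] ** 2
print(df['bmi'].round(1).tolist())  # [22.9, 27.8]
```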
Handling Categorical Data
Label Encoding
Convert categories to numbers:
code.py
from sklearn.preprocessing import LabelEncoder
colors = ['red', 'blue', 'green', 'red', 'blue']
le = LabelEncoder()
encoded = le.fit_transform(colors)
print(encoded)  # [2 0 1 2 0]
Problem: Model may think blue(0) < green(1) < red(2)
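When the categories genuinely are ordered (sizes, ratings), one option is `OrdinalEncoder` with an explicit category order, so the numbers reflect real ranking rather than alphabetical accident. A minimal sketch (the `sizes` data is illustrative):

code.py
```python
from sklearn.preprocessing import OrdinalEncoder

# Explicit, meaningful order: small < medium < large
sizes = [['small'], ['large'], ['medium'], ['small']]
enc = OrdinalEncoder(categories=[['small', 'medium', 'large']])
encoded = enc.fit_transform(sizes)
print(encoded.ravel())  # [0. 2. 1. 0.]
```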
One-Hot Encoding
Create binary columns:
code.py
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
df = pd.DataFrame({'color': ['red', 'blue', 'green', 'red']})
# Method 1: pandas
encoded = pd.get_dummies(df, columns=['color'])
print(encoded)
# Method 2: sklearn
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(df[['color']])
print(encoded)
Feature Scaling
Standardization (Z-score)
Transform each feature to mean 0 and standard deviation 1:
code.py
from sklearn.preprocessing import StandardScaler
data = [[100, 0.5], [200, 1.0], [150, 0.8]]
scaler = StandardScaler()
scaled = scaler.fit_transform(data)
print(scaled)
Use for: Most algorithms (SVM, Neural Networks, KNN)
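A quick sanity check, a sketch reusing the data above, confirms what standardization promises:

code.py
```python
import numpy as np
from sklearn.preprocessing import StandardScaler

data = [[100, 0.5], [200, 1.0], [150, 0.8]]
scaled = StandardScaler().fit_transform(data)

# Each column should now have mean ~0 and standard deviation ~1
print(np.allclose(scaled.mean(axis=0), 0))  # True
print(np.allclose(scaled.std(axis=0), 1))   # True
```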
Min-Max Scaling
Scale to range [0, 1]:
code.py
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled = scaler.fit_transform(data)
print(scaled)
Use for: Neural networks, image data
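One caveat for both scalers: fit on the training data only, then reuse the fitted scaler on the test data, otherwise test-set statistics leak into training. A minimal sketch (the arrays are illustrative):

code.py
```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler

X_train = np.array([[100.0], [200.0], [150.0]])
X_test = np.array([[175.0]])

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn min/max from train only
X_test_scaled = scaler.transform(X_test)        # apply the same min/max to test
print(X_test_scaled)  # [[0.75]]
```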
Creating New Features
Mathematical Combinations
code.py
import pandas as pd
df = pd.DataFrame({
'length': [10, 20, 15],
'width': [5, 8, 6]
})
# New features
df['area'] = df['length'] * df['width']
df['perimeter'] = 2 * (df['length'] + df['width'])
df['ratio'] = df['length'] / df['width']
print(df)
Binning (Discretization)
Convert continuous to categorical:
code.py
import pandas as pd
df = pd.DataFrame({'age': [22, 35, 45, 18, 60, 28]})
# Create age groups
df['age_group'] = pd.cut(df['age'],
bins=[0, 25, 40, 60, 100],
labels=['Young', 'Adult', 'Middle', 'Senior'])
print(df)
Polynomial Features
code.py
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
X = np.array([[2, 3], [4, 5]])
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
print(f"Original: {X.shape}") # (2, 2)
print(f"Polynomial: {X_poly.shape}") # (2, 5)
# Includes: x1, x2, x1², x1*x2, x2²
Handling Missing Values
code.py
from sklearn.impute import SimpleImputer
import numpy as np
data = [[1, 2], [np.nan, 3], [7, np.nan]]
# Mean imputation
imputer = SimpleImputer(strategy='mean')
filled = imputer.fit_transform(data)
print(filled)
# Other strategies: 'median', 'most_frequent', 'constant'
Date Features
code.py
import pandas as pd
df = pd.DataFrame({
'date': pd.to_datetime(['2024-01-15', '2024-06-20', '2024-12-25'])
})
# Extract features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['dayofweek'] = df['date'].dt.dayofweek # 0=Monday
df['is_weekend'] = df['date'].dt.dayofweek >= 5
print(df)
Text Features
code.py
import pandas as pd
df = pd.DataFrame({
'text': ['Hello world', 'Machine learning is fun', 'Python']
})
# Basic features
df['word_count'] = df['text'].str.split().str.len()
df['char_count'] = df['text'].str.len()
df['avg_word_length'] = df['char_count'] / df['word_count']
print(df)
Feature Selection
Remove irrelevant features:
code.py
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
# Select top 2 features
selector = SelectKBest(f_classif, k=2)
X_new = selector.fit_transform(X, y)
print(f"Original: {X.shape}") # (150, 4)
print(f"Selected: {X_new.shape}") # (150, 2)
# Which features were selected?
print(f"Selected features: {selector.get_support()}")
Using Pipelines
Combine all preprocessing:
code.py
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler()),
('classifier', LogisticRegression())
])
# Fit everything at once (assumes X_train, y_train, X_test are already defined)
pipeline.fit(X_train, y_train)
# Predict
predictions = pipeline.predict(X_test)
Complete Example
code.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
# Sample data
df = pd.DataFrame({
'age': [25, 35, 45, np.nan, 30, 55],
'income': [50000, 80000, 120000, 60000, 70000, 150000],
'city': ['NYC', 'LA', 'NYC', 'Chicago', 'LA', 'NYC'],
'purchased': [0, 1, 1, 0, 0, 1]
})
# Feature engineering: income-to-age ratio (shown for illustration; not used below)
df['income_per_age'] = df['income'] / df['age']
# Separate features and target
X = df[['age', 'income', 'city']]
y = df['purchased']
# Define transformers
numeric_features = ['age', 'income']
categorical_features = ['city']
preprocessor = ColumnTransformer([
('num', Pipeline([
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
]), numeric_features),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])
# Full pipeline
pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', LogisticRegression())
])
# Split and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
pipeline.fit(X_train, y_train)
print(f"Accuracy: {pipeline.score(X_test, y_test):.0%}")
Key Points
- One-hot encode categorical features
- Scale numerical features
- Create new features from existing ones
- Handle missing values before training
- Extract features from dates and text
- Use pipelines to combine all steps
- Select only useful features
Module Complete!
You've learned Statistics and Machine Learning basics:
- Descriptive statistics
- Probability
- Hypothesis testing
- ML fundamentals
- Model evaluation
- Cross-validation
- Feature engineering
Now you're ready for real data science projects!