Predictive Modeling
The Predictive Modeling use case type covers most common machine learning tasks, including classification, regression, multiclass classification, multilabel classification, and others.
This is a detailed guide on how to train predictive models using the Abacus.AI Python SDK.
Step 1: Create Dummy Data
Your data should be a single Feature Group that contains all of the features required for training.
If you already have data in the platform, you can skip creating the dummy data. Use the code below as a guideline and adapt it to your use case.
def create_house_price_data():
    # Imports live inside the function so the captured source is self-contained
    # when the function is registered as a Python Feature Group in Step 2
    import pandas as pd
    import numpy as np
    np.random.seed(42)
    # Create 1000 rows of dummy data
    n_rows = 1000
    data = {
        # ID and timestamp columns (to be ignored during training)
        'property_id': [f'PROP_{i:04d}' for i in range(n_rows)],
        'listing_date': pd.date_range(start='2022-01-01', periods=n_rows).astype(str),
        # Feature columns
        'square_feet': np.random.normal(2000, 500, n_rows).round(),
        'bedrooms': np.random.choice([2, 3, 4, 5], n_rows, p=[0.2, 0.4, 0.3, 0.1]),
        'bathrooms': np.random.choice([1, 2, 2.5, 3, 3.5], n_rows),
        'location_type': np.random.choice(['Urban', 'Suburban', 'Rural'], n_rows),
        'property_age': np.random.randint(0, 50, n_rows),
        # Target column (house price); placeholder values, overwritten below
        'sale_price': np.random.normal(350000, 100000, n_rows).round()
    }
    df = pd.DataFrame(data)
    # Add some realistic correlations to the target
    df['sale_price'] = (
        df['square_feet'] * 100 +  # bigger houses cost more
        (df['bedrooms'] * 25000) +  # more bedrooms increase price
        (df['bathrooms'] * 15000) +  # more bathrooms increase price
        np.where(df['location_type'] == 'Urban', 50000,
                 np.where(df['location_type'] == 'Suburban', 25000, 0)) -  # urban locations cost more
        (df['property_age'] * 1000) +  # older houses cost less
        np.random.normal(0, 25000, n_rows)  # add some noise
    ).round()
    return df
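Before uploading, it's worth a quick local sanity check that the generated data looks reasonable (plain pandas, run locally):
# Quick local sanity check of the dummy data
df = create_house_price_data()
print(df.head())
print(df['sale_price'].describe())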
Step 2: Upload Data as a Feature Group
We will now register the data in the platform.
from abacusai import ApiClient

client = ApiClient()
# Create a Python Feature Group from the function defined in Step 1
fg = client.create_feature_group_from_python_function(
    function=create_house_price_data,
    table_name="house_price_prediction_data",
    python_function_bindings=[]
)
print(f"Created Feature Group with ID: {fg.feature_group_id}")
# Create a Feature Group Version
fgv = client.create_feature_group_version(fg.feature_group_id)
fgv = fgv.wait_for_materialization()
print(f"Feature Group Version Status: {fgv.status}")
# Add the Feature Group to the project
client.add_feature_group_to_project(
    feature_group_id=fg.feature_group_id,
    project_id="your_project_id",
    feature_group_type="CUSTOM_TABLE"
)
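If your data already lives in a local pandas DataFrame rather than a Python function, one alternative is uploading it directly as a dataset. This is a minimal sketch, assuming the client's create_dataset_from_pandas helper is available in your SDK version; the table name and the DataFrame df are illustrative:
# Sketch: upload an existing DataFrame as a dataset-backed Feature Group.
# `df` is assumed to hold your data; the table name is illustrative.
dataset = client.create_dataset_from_pandas(
    feature_group_table_name="existing_house_data",
    df=df
)
print(f"Created dataset with ID: {dataset.dataset_id}")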
Step 3: Set Feature Types
Setting feature types is a mandatory step for all use cases, unless the mapping has already been done automatically. It tells Abacus.AI how to use each feature in the Feature Group.
# Set TARGET mapping
client.set_feature_mapping(
    project_id="your_project_id",
    feature_group_id="your_fg_id",
    feature_name="sale_price",
    feature_mapping="TARGET"
)
# Set IGNORE mappings for ID and timestamp columns
client.set_feature_mapping(
    project_id="your_project_id",
    feature_group_id="your_fg_id",
    feature_name="property_id",
    feature_mapping="IGNORE"
)
client.set_feature_mapping(
    project_id="your_project_id",
    feature_group_id="your_fg_id",
    feature_name="listing_date",
    feature_mapping="IGNORE"
)
# Verify the updated mappings
fg_details = client.describe_project_feature_group(
    project_id="your_project_id",
    feature_group_id="your_fg_id"
)
print("\nUpdated features and their mappings:")
for feature in fg_details.features:
    print(f"{feature.name}: {feature.feature_mapping if hasattr(feature, 'feature_mapping') else 'None'}")
Step 4: Train the Model
Now we can train the model!
from datetime import datetime
from abacusai import ApiClient
from abacusai.api_class import (
    RegressionTrainingConfig, RegressionObjective, RegressionTypeOfSplit,
    RegressionTimeSplitMethod, RegressionLossFunction, RegressionTreeHPOMode
)

# Create training configuration
training_config = RegressionTrainingConfig(
    # Use RMSE (Root Mean Square Error) as the primary objective since it's commonly
    # used for house price prediction and penalizes larger errors more heavily
    objective=RegressionObjective.RMSE,
    # Use R-squared as the sorting objective to understand the proportion of variance explained
    sort_objective=RegressionObjective.R_SQUARED_COEFFICIENT_OF_DETERMINATION,
    # Use timestamp-based splitting since we have listing_date
    type_of_split=RegressionTypeOfSplit.TIMESTAMP_BASED,
    timestamp_based_splitting_column="listing_date",
    timestamp_based_splitting_method=RegressionTimeSplitMethod.TEST_SPLIT_PERCENTAGE_BASED,
    test_split=15,  # Use 15% of the data for testing
    # Enable cross-validation for more robust model evaluation
    k_fold_cross_validation=True,
    num_cv_folds=5,
    # Since house prices can have outliers, use numeric clipping
    numeric_clipping_percentile=0.99,
    # Enable feature selection to identify the most important features
    perform_feature_selection=True,
    feature_selection_intensity=50,  # Moderate feature selection intensity
    # Use Huber loss, which is more robust to outliers than MSE
    loss_function=RegressionLossFunction.HUBER,
    # Enable thorough HPO mode for better model performance
    tree_hpo_mode=RegressionTreeHPOMode.THOROUGH
)
# Create and train the model
client = ApiClient()
model = client.train_model(
    project_id="your_project_id",
    name=f"House Price Prediction Model - {datetime.now().strftime('%Y-%m-%d')}",
    training_config=training_config,
    feature_group_ids=["your_fg_id"]  # Our house price prediction Feature Group
)
print(f"Created model with ID: {model.model_id}")
# Wait for training to complete
model = model.wait_for_full_automl()
# Get the latest model version details
model_version = client.describe_model_version(model.latest_model_version.model_version)
print(f"\nModel Version Status: {model_version.status}")
print("\nBest Algorithm Details:")
print(f"Algorithm: {model_version.best_algorithm['name']}")
print("\nMetrics:")
for metric, value in model_version.best_algorithm['metrics'].items():
    print(f"{metric}: {value}")
Step 5: Deploy the Model
Now we can deploy the model!
# Create deployment (use this code after model training is complete)
deployment = client.create_deployment(
    name="House Price Prediction Service",
    model_id=model.model_id,
    start=True
)
# Wait for deployment to be ready
deployment = deployment.wait_for_deployment()
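Once the deployment is active, you can request predictions against it. A minimal sketch, assuming a deployment token created for the project and the client's predict call; the feature values are illustrative:
# Create a deployment token for authenticating prediction requests
token = client.create_deployment_token(project_id="your_project_id")

# Request a prediction for a new property (illustrative feature values)
prediction = client.predict(
    deployment_token=token.deployment_token,
    deployment_id=deployment.deployment_id,
    query_data={
        'square_feet': 2200,
        'bedrooms': 3,
        'bathrooms': 2.5,
        'location_type': 'Suburban',
        'property_age': 10
    }
)
print(prediction)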