Predictive Modeling
The Predictive Modeling use case type covers most common machine learning tasks, including classification, regression, multiclass classification, multilabel classification, and others.
This is a detailed guide on how to train predictive models using the Abacus.AI Python SDK.
Step 1: Create Dummy Data
Your data should be a single Feature Group that contains all of the features required for training.
If you already have data in the platform, you can skip creating the dummy data. Use the code below as a guideline and adapt it to your use case.
def create_house_price_data():
    # Imports live inside the function so the captured source is self-contained
    # when the function is registered as a Python Feature Group in Step 2
    import pandas as pd
    import numpy as np
    np.random.seed(42)
    # Create 1000 rows of dummy data
    n_rows = 1000
    data = {
        # ID and timestamp columns (to be ignored during training)
        'property_id': [f'PROP_{i:04d}' for i in range(n_rows)],
        'listing_date': pd.date_range(start='2022-01-01', periods=n_rows).astype(str),
        # Feature columns
        'square_feet': np.random.normal(2000, 500, n_rows).round(),
        'bedrooms': np.random.choice([2, 3, 4, 5], n_rows, p=[0.2, 0.4, 0.3, 0.1]),
        'bathrooms': np.random.choice([1, 2, 2.5, 3, 3.5], n_rows),
        'location_type': np.random.choice(['Urban', 'Suburban', 'Rural'], n_rows),
        'property_age': np.random.randint(0, 50, n_rows),
        # Target column (house price); placeholder values, overwritten below
        'sale_price': np.random.normal(350000, 100000, n_rows).round()
    }
    df = pd.DataFrame(data)
    # Add some realistic correlations to the target
    df['sale_price'] = (
        df['square_feet'] * 100 +  # bigger houses cost more
        (df['bedrooms'] * 25000) +  # more bedrooms increase price
        (df['bathrooms'] * 15000) +  # more bathrooms increase price
        np.where(df['location_type'] == 'Urban', 50000,
                 np.where(df['location_type'] == 'Suburban', 25000, 0)) -  # urban locations cost more
        (df['property_age'] * 1000) +  # older houses cost less
        np.random.normal(0, 25000, n_rows)  # add some noise
    ).round()
    return df
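Before uploading, it's worth a quick local sanity check that the generated data looks reasonable (plain pandas, run locally):
# Quick local sanity check of the dummy data
df = create_house_price_data()
print(df.head())
print(df['sale_price'].describe())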
Step 2: Upload Data as a Feature Group
We will now register the data in the platform.
from abacusai import ApiClient

client = ApiClient()
# Create a Python Feature Group from the function defined in Step 1
fg = client.create_feature_group_from_python_function(
    function=create_house_price_data,
    table_name="house_price_prediction_data",
    python_function_bindings=[]
)
print(f"Created Feature Group with ID: {fg.feature_group_id}")
# Create a Feature Group Version
fgv = client.create_feature_group_version(fg.feature_group_id)
fgv = fgv.wait_for_materialization()
print(f"Feature Group Version Status: {fgv.status}")
# Add the Feature Group to the project
client.add_feature_group_to_project(
    feature_group_id=fg.feature_group_id,
    project_id="your_project_id",
    feature_group_type="CUSTOM_TABLE"
)
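If your data already lives in a local pandas DataFrame rather than a Python function, one alternative is uploading it directly as a dataset. This is a minimal sketch, assuming the client's create_dataset_from_pandas helper is available in your SDK version; the table name and the DataFrame df are illustrative:
# Sketch: upload an existing DataFrame as a dataset-backed Feature Group.
# `df` is assumed to hold your data; the table name is illustrative.
dataset = client.create_dataset_from_pandas(
    feature_group_table_name="existing_house_data",
    df=df
)
print(f"Created dataset with ID: {dataset.dataset_id}")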
Step 3: Set Feature Types
Setting feature types is a mandatory step for all use cases, unless the mapping has already been done automatically. It tells Abacus.AI how to use each feature in the Feature Group.
# Set TARGET mapping
client.set_feature_mapping(
    project_id="your_project_id",
    feature_group_id="your_fg_id",
    feature_name="sale_price",
    feature_mapping="TARGET"
)
# Set IGNORE mappings for ID and timestamp columns
client.set_feature_mapping(
    project_id="your_project_id",
    feature_group_id="your_fg_id",
    feature_name="property_id",
    feature_mapping="IGNORE"
)
client.set_feature_mapping(
    project_id="your_project_id",
    feature_group_id="your_fg_id",
    feature_name="listing_date",
    feature_mapping="IGNORE"
)
# Verify the updated mappings
fg_details = client.describe_project_feature_group(
    project_id="your_project_id",
    feature_group_id="your_fg_id"
)
print("\nUpdated features and their mappings:")
for feature in fg_details.features:
    print(f"{feature.name}: {feature.feature_mapping if hasattr(feature, 'feature_mapping') else 'None'}")
Step 4: Train the Model
Now we can train the model!
from datetime import datetime
from abacusai import ApiClient
from abacusai.api_class import (
    RegressionTrainingConfig, RegressionObjective, RegressionTypeOfSplit,
    RegressionTimeSplitMethod, RegressionLossFunction, RegressionTreeHPOMode
)

# Create training configuration
training_config = RegressionTrainingConfig(
    # Use RMSE (Root Mean Square Error) as the primary objective since it's commonly
    # used for house price prediction and penalizes larger errors more heavily
    objective=RegressionObjective.RMSE,
    # Use R-squared as the sorting objective to understand the proportion of variance explained
    sort_objective=RegressionObjective.R_SQUARED_COEFFICIENT_OF_DETERMINATION,
    # Use timestamp-based splitting since we have listing_date
    type_of_split=RegressionTypeOfSplit.TIMESTAMP_BASED,
    timestamp_based_splitting_column="listing_date",
    timestamp_based_splitting_method=RegressionTimeSplitMethod.TEST_SPLIT_PERCENTAGE_BASED,
    test_split=15,  # Use 15% of the data for testing
    # Enable cross-validation for more robust model evaluation
    k_fold_cross_validation=True,
    num_cv_folds=5,
    # Since house prices can have outliers, use numeric clipping
    numeric_clipping_percentile=0.99,
    # Enable feature selection to identify the most important features
    perform_feature_selection=True,
    feature_selection_intensity=50,  # Moderate feature selection intensity
    # Use Huber loss, which is more robust to outliers than MSE
    loss_function=RegressionLossFunction.HUBER,
    # Enable thorough HPO mode for better model performance
    tree_hpo_mode=RegressionTreeHPOMode.THOROUGH
)
# Create and train the model
client = ApiClient()
model = client.train_model(
    project_id="your_project_id",
    name=f"House Price Prediction Model - {datetime.now().strftime('%Y-%m-%d')}",
    training_config=training_config,
    feature_group_ids=["your_fg_id"]  # Our house price prediction Feature Group
)
print(f"Created model with ID: {model.model_id}")
# Wait for training to complete
model = model.wait_for_full_automl()
# Get the latest model version details
model_version = client.describe_model_version(model.latest_model_version.model_version)
print(f"\nModel Version Status: {model_version.status}")
print("\nBest Algorithm Details:")
print(f"Algorithm: {model_version.best_algorithm['name']}")
print("\nMetrics:")
for metric, value in model_version.best_algorithm['metrics'].items():
    print(f"{metric}: {value}")
Step 5: Deploy the Model
Now we can deploy the model!
# Create deployment (use this code after model training is complete)
deployment = client.create_deployment(
    name="House Price Prediction Service",
    model_id=model.model_id,
    start=True
)
# Wait for deployment to be ready
deployment = deployment.wait_for_deployment()
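Once the deployment is active, you can request predictions against it. A minimal sketch, assuming a deployment token created for the project and the client's predict call; the feature values are illustrative:
# Create a deployment token for authenticating prediction requests
token = client.create_deployment_token(project_id="your_project_id")

# Request a prediction for a new property (illustrative feature values)
prediction = client.predict(
    deployment_token=token.deployment_token,
    deployment_id=deployment.deployment_id,
    query_data={
        'square_feet': 2200,
        'bedrooms': 3,
        'bathrooms': 2.5,
        'location_type': 'Suburban',
        'property_age': 10
    }
)
print(prediction)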