mlops.data

feature_engineering(df)

Compute key features of the preprocessed dataset.

Compute the following features from the preprocessed dataset df: age, years_on_the_job, nb_previous_loans, avg_amount_loans_previous and flag_own_car.

Parameters:
  • df (DataFrame) – Preprocessed dataset.

Returns:
  • DataFrame – Dataset restricted to id, the computed features and status.

Source code in mlops/data.py
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute key features of the preprocessed dataset.

    Compute features of the ``df`` preprocessed dataset:
    ``age``, ``years_on_the_job``, ``nb_previous_loans``,
    ``avg_amount_loans_previous`` and ``flag_own_car``.

    The input is copied first, so the caller's dataframe is not modified.

    Parameters:
        df (DataFrame):
            Preprocessed dataset. Must contain the columns ``id``,
            ``loan_date``, ``loan_amount``, ``birthday``,
            ``job_start_date``, ``flag_own_car`` and ``status``.
            ``birthday`` and ``job_start_date`` must already hold
            datetimes, since they are subtracted from today's date.

    Returns:
        Dataset restricted to ``id``, the computed features and
        ``status``.
    """
    df = df.copy()
    loans_by_id = df.groupby("id")

    # Rank of each loan among the loans sharing its ``id`` (ties broken by
    # row order); subtracting 1 turns the rank into a count of earlier loans.
    df["nb_previous_loans"] = (
        loans_by_id["loan_date"].rank(method="first") - 1
    )

    # ``transform`` (rather than ``apply``) guarantees a result aligned on
    # the original index; ``apply`` returns a MultiIndex on pandas >= 2.0,
    # which cannot be assigned back as a column.
    # The shift excludes the current loan, so the first loan of each ``id``
    # gets NaN. This relies on the row order within each ``id``.
    df["avg_amount_loans_previous"] = loans_by_id["loan_amount"].transform(
        lambda amounts: amounts.shift().expanding().mean()
    )

    # Evaluate "today" once so both durations use the same reference date.
    today = pd.to_datetime("today").normalize()
    # Whole years approximated as 365-day blocks (leap days are ignored).
    df["age"] = (today - df["birthday"]).dt.days // 365
    df["years_on_the_job"] = (today - df["job_start_date"]).dt.days // 365

    # "N" maps to 0; every other value maps to 1.
    df["flag_own_car"] = df["flag_own_car"].ne("N").astype(int)

    return df[
        [
            "id",
            "age",
            "years_on_the_job",
            "nb_previous_loans",
            "avg_amount_loans_previous",
            "flag_own_car",
            "status",
        ]
    ]

preprocess(df, inplace=False)

Preprocess the dataset.

Sort dataset by id and loan_date. Convert all dates to datetime objects.

Parameters:
  • df (DataFrame) – Dataset.

  • inplace (bool) – Modify the dataframe in place.

Returns:
  • DataFrame – Preprocessed dataset or None if inplace=True.

Source code in mlops/data.py
def preprocess(df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
    """
    Preprocess the dataset.

    Order the rows by ``id`` then ``loan_date``, renumber the index from
    zero, and hand the ``loan_date``, ``birthday`` and ``job_start_date``
    columns to ``_to_datetime`` for conversion to ``datetime`` objects.

    Parameters:
        df (DataFrame):
            Dataset.
        inplace (bool, optional):
            Modify the dataframe in place instead of working on a copy.

    Returns:
        Preprocessed dataset or ``None`` if ``inplace=True``.
    """
    # Work on the caller's object only when explicitly asked to.
    frame = df if inplace else df.copy()

    frame.sort_values(by=["id", "loan_date"], inplace=True)
    # Drop the old index so row labels follow the new sort order.
    frame.reset_index(drop=True, inplace=True)

    date_columns = ["loan_date", "birthday", "job_start_date"]
    converted = _to_datetime(frame, columns=date_columns, inplace=inplace)

    # In-place mode deliberately returns nothing; the result of
    # ``_to_datetime`` is only handed back when working on a copy.
    return None if inplace else converted