Skip to content

Data To Features

API Classes

For most users, the API Classes will provide all the general functionality needed to create a full AWS ML Pipeline.

DataToFeaturesLight: Base Class for Light DataSource to FeatureSet using Pandas

DataToFeaturesLight

Bases: Transform

DataToFeaturesLight: Base Class for Light DataSource to FeatureSet using Pandas

Common Usage
to_features = DataToFeaturesLight(data_uuid, feature_uuid)
to_features.set_output_tags(["abalone", "public", "whatever"])
to_features.transform(target_column="target"/None, id_column="id"/None,
                      event_time_column="date"/None, query=str/None)
Source code in src/sageworks/core/transforms/data_to_features/light/data_to_features_light.py
class DataToFeaturesLight(Transform):
    """DataToFeaturesLight: Base Class for Light DataSource to FeatureSet using Pandas

    The class holds two DataFrames: ``input_df`` (filled by ``pre_transform``)
    and ``output_df`` (filled by ``transform_impl`` and published by
    ``post_transform``). Subclasses are expected to override ``transform_impl``.

    Common Usage:
        ```
        to_features = DataToFeaturesLight(data_uuid, feature_uuid)
        to_features.set_output_tags(["abalone", "public", "whatever"])
        to_features.transform(target_column="target"/None, id_column="id"/None,
                              event_time_column="date"/None, query=str/None)
        ```
    """

    def __init__(self, data_uuid: str, feature_uuid: str):
        """Create a light-weight DataSource to FeatureSet transform

        Args:
            data_uuid (str): The UUID of the SageWorks DataSource to be transformed
            feature_uuid (str): The UUID of the SageWorks FeatureSet to be created
        """
        super().__init__(data_uuid, feature_uuid)

        # Declare what this transform consumes and what it produces
        self.input_type = TransformInput.DATA_SOURCE
        self.output_type = TransformOutput.FEATURE_SET

        # Both DataFrames stay unset until the transform stages populate them
        self.input_df = self.output_df = None

    def pre_transform(self, query: str = None, **kwargs):
        """Load the input DataSource into ``self.input_df``

        Args:
            query (str): Optional query to filter the input DataFrame
        """
        source_reader = DataToPandas(self.input_uuid)
        source_reader.transform(query=query)
        self.input_df = source_reader.get_output()

    def transform_impl(self, **kwargs):
        """Populate ``self.output_df`` from ``self.input_df``

        The base class simply passes the input through (the same DataFrame
        object, not a copy); subclasses should override this method.
        """
        self.output_df = self.input_df

    def post_transform(self, target_column=None, id_column=None, event_time_column=None, auto_one_hot=False, **kwargs):
        """Publish ``self.output_df`` (which should be populated by now) as a Feature Set

        Args:
            target_column (str): The name of the target column in the output DataFrame (default: None)
            id_column (str): The name of the id column in the output DataFrame (default: None)
            event_time_column (str): The name of the event time column in the output DataFrame (default: None)
            auto_one_hot (bool): Automatically one-hot encode categorical columns (default: False)
        """
        # Hand the output DataFrame, tags, and metadata to the publishing transform
        publisher = PandasToFeatures(self.output_uuid, auto_one_hot=auto_one_hot)
        publisher.set_input(
            self.output_df,
            target_column=target_column,
            id_column=id_column,
            event_time_column=event_time_column,
        )
        publisher.set_output_tags(self.output_tags)
        publisher.add_output_meta(self.output_meta)
        publisher.transform()

        # Spin up the freshly written FeatureSet and onboard it
        feature_set = FeatureSetCore(self.output_uuid, force_refresh=True)
        feature_set.onboard()

__init__(data_uuid, feature_uuid)

DataToFeaturesLight Initialization

Parameters:

Name Type Description Default
data_uuid str

The UUID of the SageWorks DataSource to be transformed

required
feature_uuid str

The UUID of the SageWorks FeatureSet to be created

required
Source code in src/sageworks/core/transforms/data_to_features/light/data_to_features_light.py
def __init__(self, data_uuid: str, feature_uuid: str):
    """Create a light-weight DataSource to FeatureSet transform

    Args:
        data_uuid (str): The UUID of the SageWorks DataSource to be transformed
        feature_uuid (str): The UUID of the SageWorks FeatureSet to be created
    """
    super().__init__(data_uuid, feature_uuid)

    # Declare what this transform consumes and what it produces
    self.input_type = TransformInput.DATA_SOURCE
    self.output_type = TransformOutput.FEATURE_SET

    # Both DataFrames stay unset until the transform stages populate them
    self.input_df = self.output_df = None

post_transform(target_column=None, id_column=None, event_time_column=None, auto_one_hot=False, **kwargs)

At this point the output DataFrame should be populated, so publish it as a Feature Set.

Args:
    target_column (str): The name of the target column in the output DataFrame (default: None)
    id_column (str): The name of the id column in the output DataFrame (default: None)
    event_time_column (str): The name of the event time column in the output DataFrame (default: None)
    auto_one_hot (bool): Automatically one-hot encode categorical columns (default: False)

Source code in src/sageworks/core/transforms/data_to_features/light/data_to_features_light.py
def post_transform(self, target_column=None, id_column=None, event_time_column=None, auto_one_hot=False, **kwargs):
    """Publish ``self.output_df`` (which should be populated by now) as a Feature Set

    Args:
        target_column (str): The name of the target column in the output DataFrame (default: None)
        id_column (str): The name of the id column in the output DataFrame (default: None)
        event_time_column (str): The name of the event time column in the output DataFrame (default: None)
        auto_one_hot (bool): Automatically one-hot encode categorical columns (default: False)
    """
    # Hand the output DataFrame, tags, and metadata to the publishing transform
    publisher = PandasToFeatures(self.output_uuid, auto_one_hot=auto_one_hot)
    publisher.set_input(
        self.output_df,
        target_column=target_column,
        id_column=id_column,
        event_time_column=event_time_column,
    )
    publisher.set_output_tags(self.output_tags)
    publisher.add_output_meta(self.output_meta)
    publisher.transform()

    # Spin up the freshly written FeatureSet and onboard it
    feature_set = FeatureSetCore(self.output_uuid, force_refresh=True)
    feature_set.onboard()

pre_transform(query=None, **kwargs)

Pull the input DataSource into our Input Pandas DataFrame.

Args:
    query (str): Optional query to filter the input DataFrame

Source code in src/sageworks/core/transforms/data_to_features/light/data_to_features_light.py
def pre_transform(self, query: str = None, **kwargs):
    """Load the input DataSource into ``self.input_df``

    Args:
        query (str): Optional query to filter the input DataFrame
    """
    # Run the DataSource -> Pandas transform and keep its result as our input
    source_reader = DataToPandas(self.input_uuid)
    source_reader.transform(query=query)
    self.input_df = source_reader.get_output()

transform_impl(**kwargs)

Transform the input DataFrame into a Feature Set

Source code in src/sageworks/core/transforms/data_to_features/light/data_to_features_light.py
def transform_impl(self, **kwargs):
    """Populate ``self.output_df`` from ``self.input_df``

    This reference implementation passes the input through unchanged (the
    same DataFrame object, not a copy); subclasses should override it.
    """
    self.output_df = self.input_df

MolecularDescriptors: Compute a Feature Set based on RDKit Descriptors

MolecularDescriptors

Bases: DataToFeaturesLight

MolecularDescriptors: Create a FeatureSet (RDKit Descriptors) from a DataSource

Common Usage
to_features = MolecularDescriptors(data_uuid, feature_uuid)
to_features.set_output_tags(["aqsol", "whatever"])
to_features.transform()
Source code in src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py
class MolecularDescriptors(DataToFeaturesLight):
    """MolecularDescriptors: Create a FeatureSet (RDKit Descriptors) from a DataSource

    In addition to the RDKit descriptors, a small set of Mordred descriptors
    (AcidBase, Aromatic, Polarizability, RotatableBond) is computed.

    Common Usage:
        ```
        to_features = MolecularDescriptors(data_uuid, feature_uuid)
        to_features.set_output_tags(["aqsol", "whatever"])
        to_features.transform()
        ```
    """

    def __init__(self, data_uuid: str, feature_uuid: str):
        """MolecularDescriptors Initialization

        Args:
            data_uuid (str): The UUID of the SageWorks DataSource to be transformed
            feature_uuid (str): The UUID of the SageWorks FeatureSet to be created
        """

        # Call superclass init
        super().__init__(data_uuid, feature_uuid)

        # Turn off warnings for RDKIT (revisit this)
        RDLogger.DisableLog("rdApp.*")

    def transform_impl(self, **kwargs):
        """Compute a Feature Set based on RDKit (and Mordred) Descriptors

        Raises:
            ValueError: If the input DataFrame does not have a 'smiles' column
        """

        # Check the input DataFrame has the required columns
        if "smiles" not in self.input_df.columns:
            raise ValueError("Input DataFrame must have a 'smiles' column")

        # There are certain smiles that cause Mordred to crash
        # We'll replace them with 'equivalent' smiles (these need to be verified)
        smiles_replacements = (
            ("[O-]C([O-])=O.[NH4+]CCO.[NH4+]CCO", "[O]C([O])=O.[N]CCO.[N]CCO"),
            ("[NH4+]CCO.[NH4+]CCO.[O-]C([O-])=O", "[N]CCO.[N]CCO.[O]C([O])=O"),
            ("O=S(=O)(Nn1c-nnc1)C1=CC=CC=C1", "O=S(=O)(NN(C=N1)C=N1)C(C=CC1)=CC=1"),
        )
        for problem_smiles, safe_smiles in smiles_replacements:
            self.input_df["smiles"] = self.input_df["smiles"].replace(problem_smiles, safe_smiles)

        # Compute/add all the Molecular Descriptors
        self.output_df = self.compute_molecular_descriptors(self.input_df)

        # Get the columns that are descriptors (everything that was not in the input)
        desc_columns = set(self.output_df.columns) - set(self.input_df.columns)

        # Drop any NaNs (and INFs)
        current_rows = self.output_df.shape[0]
        self.output_df = pandas_utils.drop_nans(self.output_df, how="any", subset=desc_columns)
        self.log.warning(f"Dropped {current_rows - self.output_df.shape[0]} NaN rows")

    def compute_molecular_descriptors(self, process_df: pd.DataFrame) -> pd.DataFrame:
        """Compute and add all the Molecular Descriptors

        Args:
            process_df (pd.DataFrame): The DataFrame to process (must have a 'smiles' column)

        Returns:
            pd.DataFrame: The input DataFrame with the RDKit and Mordred Descriptors added as columns
        """
        self.log.important("Computing Molecular Descriptors...")

        # Conversion to Molecules
        # NOTE(review): presumably MolFromSmiles yields None for unparsable SMILES and those
        # are passed straight to the calculators below -- confirm inputs are pre-validated
        molecules = [Chem.MolFromSmiles(smile) for smile in process_df["smiles"]]

        # Collect the RDKit descriptor names, leaving out IPC because of an overflow issue
        # See: https://github.com/rdkit/rdkit/issues/1527
        # dict.fromkeys() drops duplicates while keeping first-seen order, so the resulting
        # column order is the same on every run (a plain set() has no stable order)
        all_descriptors = list(dict.fromkeys(entry[0] for entry in Descriptors._descList if entry[0] != "Ipc"))

        # Super useful Molecular Descriptor Calculator Class
        calc = MoleculeDescriptors.MolecularDescriptorCalculator(all_descriptors)
        column_names = calc.GetDescriptorNames()
        descriptor_values = [calc.CalcDescriptors(m) for m in molecules]

        # Reuse the input index: pd.concat(axis=1) aligns rows by index label, so a fresh
        # RangeIndex here would misalign rows whenever process_df has a non-default index
        rdkit_features_df = pd.DataFrame(descriptor_values, columns=column_names, index=process_df.index)

        # Now compute Mordred Features
        descriptor_choice = [AcidBase, Aromatic, Polarizability, RotatableBond]
        calc = Calculator()
        for des in descriptor_choice:
            calc.register(des)
        mordred_df = calc.pandas(molecules, nproc=1)
        mordred_df.index = process_df.index  # same row alignment reasoning as above

        # Return the DataFrame with the RDKit and Mordred Descriptors added
        return pd.concat([process_df, rdkit_features_df, mordred_df], axis=1)

__init__(data_uuid, feature_uuid)

MolecularDescriptors Initialization

Parameters:

Name Type Description Default
data_uuid str

The UUID of the SageWorks DataSource to be transformed

required
feature_uuid str

The UUID of the SageWorks FeatureSet to be created

required
Source code in src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py
def __init__(self, data_uuid: str, feature_uuid: str):
    """Set up the MolecularDescriptors transform

    Args:
        data_uuid (str): The UUID of the SageWorks DataSource to be transformed
        feature_uuid (str): The UUID of the SageWorks FeatureSet to be created
    """
    # All of the input/output wiring is handled by the parent class
    super().__init__(data_uuid, feature_uuid)

    # Silence every RDKit "rdApp.*" logger (revisit this)
    RDLogger.DisableLog("rdApp.*")

compute_molecular_descriptors(process_df)

Compute and add all the Molecular Descriptors.

Args:
    process_df (pd.DataFrame): The DataFrame to process and generate RDKit Descriptors

Returns:
    pd.DataFrame: The input DataFrame with all the RDKit and Mordred Descriptors added

Source code in src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py
def compute_molecular_descriptors(self, process_df: pd.DataFrame) -> pd.DataFrame:
    """Compute and add all the Molecular Descriptors

    Args:
        process_df (pd.DataFrame): The DataFrame to process (must have a 'smiles' column)

    Returns:
        pd.DataFrame: The input DataFrame with the RDKit and Mordred Descriptors added as columns
    """
    self.log.important("Computing Molecular Descriptors...")

    # Conversion to Molecules
    # NOTE(review): presumably MolFromSmiles yields None for unparsable SMILES and those
    # are passed straight to the calculators below -- confirm inputs are pre-validated
    molecules = [Chem.MolFromSmiles(smile) for smile in process_df["smiles"]]

    # Collect the RDKit descriptor names, leaving out IPC because of an overflow issue
    # See: https://github.com/rdkit/rdkit/issues/1527
    # dict.fromkeys() drops duplicates while keeping first-seen order, so the resulting
    # column order is the same on every run (a plain set() has no stable order)
    all_descriptors = list(dict.fromkeys(entry[0] for entry in Descriptors._descList if entry[0] != "Ipc"))

    # Super useful Molecular Descriptor Calculator Class
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(all_descriptors)
    column_names = calc.GetDescriptorNames()
    descriptor_values = [calc.CalcDescriptors(m) for m in molecules]

    # Reuse the input index: pd.concat(axis=1) aligns rows by index label, so a fresh
    # RangeIndex here would misalign rows whenever process_df has a non-default index
    rdkit_features_df = pd.DataFrame(descriptor_values, columns=column_names, index=process_df.index)

    # Now compute Mordred Features
    descriptor_choice = [AcidBase, Aromatic, Polarizability, RotatableBond]
    calc = Calculator()
    for des in descriptor_choice:
        calc.register(des)
    mordred_df = calc.pandas(molecules, nproc=1)
    mordred_df.index = process_df.index  # same row alignment reasoning as above

    # Return the DataFrame with the RDKit and Mordred Descriptors added
    return pd.concat([process_df, rdkit_features_df, mordred_df], axis=1)

transform_impl(**kwargs)

Compute a Feature Set based on RDKit Descriptors

Source code in src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py
def transform_impl(self, **kwargs):
    """Build ``self.output_df`` by adding molecular descriptor columns to ``self.input_df``

    Raises:
        ValueError: If the input DataFrame does not have a 'smiles' column
    """
    # Guard clause: nothing can be computed without a 'smiles' column
    if "smiles" not in self.input_df.columns:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    # A few specific smiles are known to crash Mordred, so each one is swapped
    # for an 'equivalent' smiles string (these substitutions need to be verified)
    smiles_replacements = (
        ("[O-]C([O-])=O.[NH4+]CCO.[NH4+]CCO", "[O]C([O])=O.[N]CCO.[N]CCO"),
        ("[NH4+]CCO.[NH4+]CCO.[O-]C([O-])=O", "[N]CCO.[N]CCO.[O]C([O])=O"),
        ("O=S(=O)(Nn1c-nnc1)C1=CC=CC=C1", "O=S(=O)(NN(C=N1)C=N1)C(C=CC1)=CC=1"),
    )
    for problem_smiles, safe_smiles in smiles_replacements:
        self.input_df["smiles"] = self.input_df["smiles"].replace(problem_smiles, safe_smiles)

    # Add every descriptor column to a new output DataFrame
    self.output_df = self.compute_molecular_descriptors(self.input_df)

    # Descriptor columns are whatever the computation added on top of the input columns
    desc_columns = set(self.output_df.columns).difference(self.input_df.columns)

    # Remove rows with NaNs (and INFs) in any descriptor column and report how many went away
    rows_before = self.output_df.shape[0]
    self.output_df = pandas_utils.drop_nans(self.output_df, how="any", subset=desc_columns)
    rows_dropped = rows_before - self.output_df.shape[0]
    self.log.warning(f"Dropped {rows_dropped} NaN rows")