Data To Features

API Classes

For most users, the API Classes provide all the general functionality needed to create a full AWS ML Pipeline.

DataToFeaturesLight: Base Class for Light DataSource to FeatureSet using Pandas

DataToFeaturesLight

Bases: Transform

DataToFeaturesLight: Base Class for Light DataSource to FeatureSet using Pandas

Common Usage
```python
to_features = DataToFeaturesLight(data_uuid, feature_uuid)
to_features.set_output_tags(["abalone", "public", "whatever"])
to_features.transform(id_column="id"/None, event_time_column="date"/None, query=str/None)
```
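A concrete version of the usage above, as a minimal sketch: the DataSource name ("abalone_data"), FeatureSet name ("abalone_features"), and tags are illustrative, and the import path is inferred from the source file path shown below.

```python
from sageworks.core.transforms.data_to_features.light.data_to_features_light import DataToFeaturesLight

# Illustrative names: assumes a DataSource named "abalone_data" already exists
to_features = DataToFeaturesLight("abalone_data", "abalone_features")
to_features.set_output_tags(["abalone", "public"])

# id_column is required; "auto" requests auto-generated IDs
to_features.transform(id_column="auto")
```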
Source code in src/sageworks/core/transforms/data_to_features/light/data_to_features_light.py
class DataToFeaturesLight(Transform):
    """DataToFeaturesLight: Base Class for Light DataSource to FeatureSet using Pandas

    Common Usage:
        ```python
        to_features = DataToFeaturesLight(data_uuid, feature_uuid)
        to_features.set_output_tags(["abalone", "public", "whatever"])
        to_features.transform(id_column="id"/None, event_time_column="date"/None, query=str/None)
        ```
    """

    def __init__(self, data_uuid: str, feature_uuid: str):
        """DataToFeaturesLight Initialization

        Args:
            data_uuid (str): The UUID of the SageWorks DataSource to be transformed
            feature_uuid (str): The UUID of the SageWorks FeatureSet to be created
        """

        # Call superclass init
        super().__init__(data_uuid, feature_uuid)

        # Set up all my instance attributes
        self.input_type = TransformInput.DATA_SOURCE
        self.output_type = TransformOutput.FEATURE_SET
        self.input_df = None
        self.output_df = None

    def pre_transform(self, query: str = None, **kwargs):
        """Pull the input DataSource into our Input Pandas DataFrame
        Args:
            query(str): Optional query to filter the input DataFrame
        """

        # Grab the Input (Data Source)
        data_to_pandas = DataToPandas(self.input_uuid)
        data_to_pandas.transform(query=query)
        self.input_df = data_to_pandas.get_output()

        # Check if any column names exceed 64 characters (AWS FeatureGroup limit)
        for col in self.input_df.columns:
            if len(col) > 64:
                raise ValueError(f"Column name '{col}' > 64 characters. AWS FeatureGroup limits to 64 characters.")

    def transform_impl(self, **kwargs):
        """Transform the input DataFrame into a Feature Set"""

        # This is a reference implementation that should be overridden by the subclass
        self.output_df = self.input_df

    def post_transform(self, id_column, event_time_column=None, one_hot_columns=None, **kwargs):
        """At this point the output DataFrame should be populated, so publish it as a Feature Set

        Args:
            id_column (str): The ID column (must be specified, use "auto" for auto-generated IDs).
            event_time_column (str, optional): The name of the event time column (default: None).
            one_hot_columns (list, optional): The list of columns to one-hot encode (default: None).
        """
        # Now publish to the output location
        output_features = PandasToFeatures(self.output_uuid)
        output_features.set_input(
            self.output_df, id_column=id_column, event_time_column=event_time_column, one_hot_columns=one_hot_columns
        )
        output_features.set_output_tags(self.output_tags)
        output_features.add_output_meta(self.output_meta)
        output_features.transform()

__init__(data_uuid, feature_uuid)

DataToFeaturesLight Initialization

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| data_uuid | str | The UUID of the SageWorks DataSource to be transformed | required |
| feature_uuid | str | The UUID of the SageWorks FeatureSet to be created | required |
Source code in src/sageworks/core/transforms/data_to_features/light/data_to_features_light.py
def __init__(self, data_uuid: str, feature_uuid: str):
    """DataToFeaturesLight Initialization

    Args:
        data_uuid (str): The UUID of the SageWorks DataSource to be transformed
        feature_uuid (str): The UUID of the SageWorks FeatureSet to be created
    """

    # Call superclass init
    super().__init__(data_uuid, feature_uuid)

    # Set up all my instance attributes
    self.input_type = TransformInput.DATA_SOURCE
    self.output_type = TransformOutput.FEATURE_SET
    self.input_df = None
    self.output_df = None

post_transform(id_column, event_time_column=None, one_hot_columns=None, **kwargs)

At this point the output DataFrame should be populated, so publish it as a Feature Set

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| id_column | str | The ID column (must be specified, use "auto" for auto-generated IDs). | required |
| event_time_column | str | The name of the event time column (default: None). | None |
| one_hot_columns | list | The list of columns to one-hot encode (default: None). | None |
Source code in src/sageworks/core/transforms/data_to_features/light/data_to_features_light.py
def post_transform(self, id_column, event_time_column=None, one_hot_columns=None, **kwargs):
    """At this point the output DataFrame should be populated, so publish it as a Feature Set

    Args:
        id_column (str): The ID column (must be specified, use "auto" for auto-generated IDs).
        event_time_column (str, optional): The name of the event time column (default: None).
        one_hot_columns (list, optional): The list of columns to one-hot encode (default: None).
    """
    # Now publish to the output location
    output_features = PandasToFeatures(self.output_uuid)
    output_features.set_input(
        self.output_df, id_column=id_column, event_time_column=event_time_column, one_hot_columns=one_hot_columns
    )
    output_features.set_output_tags(self.output_tags)
    output_features.add_output_meta(self.output_meta)
    output_features.transform()
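These arguments are not normally passed to post_transform() directly; they flow in as keyword arguments to the top-level transform() call, as the Common Usage block suggests. A minimal sketch under that assumption, with illustrative column names ("id", "date", "sex"):

```python
to_features = DataToFeaturesLight("abalone_data", "abalone_features")
to_features.set_output_tags(["abalone", "public"])

# Keyword arguments are forwarded through transform() to post_transform()
to_features.transform(
    id_column="id",            # or "auto" for auto-generated IDs
    event_time_column="date",  # optional: event time column in the data
    one_hot_columns=["sex"],   # optional: categorical columns to one-hot encode
)
```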

pre_transform(query=None, **kwargs)

Pull the input DataSource into our Input Pandas DataFrame.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| query | str | Optional query to filter the input DataFrame | None |

Source code in src/sageworks/core/transforms/data_to_features/light/data_to_features_light.py
def pre_transform(self, query: str = None, **kwargs):
    """Pull the input DataSource into our Input Pandas DataFrame
    Args:
        query(str): Optional query to filter the input DataFrame
    """

    # Grab the Input (Data Source)
    data_to_pandas = DataToPandas(self.input_uuid)
    data_to_pandas.transform(query=query)
    self.input_df = data_to_pandas.get_output()

    # Check if any column names exceed 64 characters (AWS FeatureGroup limit)
    for col in self.input_df.columns:
        if len(col) > 64:
            raise ValueError(f"Column name '{col}' > 64 characters. AWS FeatureGroup limits to 64 characters.")
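A sketch of the query argument in practice: it filters (or reshapes) the input before feature creation, which also makes it a convenient place to alias any column names that would exceed the 64-character FeatureGroup limit. The table name, column names, and SQL below are illustrative; the exact dialect depends on how the underlying DataSource executes queries.

```python
to_features = DataToFeaturesLight("abalone_data", "abalone_features")
to_features.set_output_tags(["abalone", "filtered"])

# Illustrative SQL: drop young abalone and alias an overly long column name
query = """
    SELECT id, sex, length, diameter,
           some_really_long_measurement_column_name AS shell_weight
    FROM abalone_data
    WHERE rings > 5
"""
to_features.transform(id_column="id", query=query)
```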

transform_impl(**kwargs)

Transform the input DataFrame into a Feature Set

Source code in src/sageworks/core/transforms/data_to_features/light/data_to_features_light.py
def transform_impl(self, **kwargs):
    """Transform the input DataFrame into a Feature Set"""

    # This is a reference implementation that should be overridden by the subclass
    self.output_df = self.input_df
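transform_impl() is the intended extension point: subclasses read self.input_df (populated by pre_transform()), compute their features, and assign the result to self.output_df, which post_transform() then publishes. MolecularDescriptors below is a real example; here is a minimal sketch of a custom subclass, with purely illustrative feature logic and column names.

```python
class RatioFeatures(DataToFeaturesLight):
    """Illustrative subclass: adds a simple ratio feature to the input data"""

    def transform_impl(self, **kwargs):
        # Start from the DataFrame that pre_transform() pulled in
        df = self.input_df.copy()

        # Hypothetical feature: ratio of two existing numeric columns
        df["length_to_diameter"] = df["length"] / df["diameter"]

        # post_transform() will publish whatever ends up in self.output_df
        self.output_df = df
```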

MolecularDescriptors: Compute a Feature Set based on RDKit Descriptors

An alternative to using this class is to call the compute_molecular_descriptors function directly:

```python
df_features = compute_molecular_descriptors(df)
to_features = PandasToFeatures("my_feature_set")
to_features.set_input(df_features, id_column="id")
to_features.set_output_tags(["blah", "whatever"])
to_features.transform()
```
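A slightly fuller sketch of this alternative path. The input column names (an "id" column plus a "smiles" column of SMILES strings), the FeatureSet name, and the import locations are assumptions; check your SageWorks installation for the exact module paths.

```python
import pandas as pd

# Assumed import paths; verify against your SageWorks installation
from sageworks.core.transforms.pandas_transforms.pandas_to_features import PandasToFeatures
from sageworks.utils.chem_utils import compute_molecular_descriptors

# Illustrative input: an id column plus SMILES strings
df = pd.DataFrame({
    "id": [1, 2],
    "smiles": ["CCO", "c1ccccc1"],  # ethanol, benzene
})

# Compute the RDKit descriptors, then publish the result as a FeatureSet
df_features = compute_molecular_descriptors(df)

to_features = PandasToFeatures("my_feature_set")
to_features.set_input(df_features, id_column="id")
to_features.set_output_tags(["rdkit", "example"])
to_features.transform()
```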

MolecularDescriptors

Bases: DataToFeaturesLight

MolecularDescriptors: Create a FeatureSet (RDKit Descriptors) from a DataSource

Common Usage
```python
to_features = MolecularDescriptors(data_uuid, feature_uuid)
to_features.set_output_tags(["aqsol", "whatever"])
to_features.transform()
```
Source code in src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py
class MolecularDescriptors(DataToFeaturesLight):
    """MolecularDescriptors: Create a FeatureSet (RDKit Descriptors) from a DataSource

    Common Usage:
        ```python
        to_features = MolecularDescriptors(data_uuid, feature_uuid)
        to_features.set_output_tags(["aqsol", "whatever"])
        to_features.transform()
        ```
    """

    def __init__(self, data_uuid: str, feature_uuid: str):
        """MolecularDescriptors Initialization

        Args:
            data_uuid (str): The UUID of the SageWorks DataSource to be transformed
            feature_uuid (str): The UUID of the SageWorks FeatureSet to be created
        """

        # Call superclass init
        super().__init__(data_uuid, feature_uuid)

    def transform_impl(self, **kwargs):
        """Compute a Feature Set based on RDKit Descriptors"""

        # Compute/add all the Molecular Descriptors
        self.output_df = compute_molecular_descriptors(self.input_df)

__init__(data_uuid, feature_uuid)

MolecularDescriptors Initialization

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| data_uuid | str | The UUID of the SageWorks DataSource to be transformed | required |
| feature_uuid | str | The UUID of the SageWorks FeatureSet to be created | required |
Source code in src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py
def __init__(self, data_uuid: str, feature_uuid: str):
    """MolecularDescriptors Initialization

    Args:
        data_uuid (str): The UUID of the SageWorks DataSource to be transformed
        feature_uuid (str): The UUID of the SageWorks FeatureSet to be created
    """

    # Call superclass init
    super().__init__(data_uuid, feature_uuid)

transform_impl(**kwargs)

Compute a Feature Set based on RDKit Descriptors

Source code in src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py
def transform_impl(self, **kwargs):
    """Compute a Feature Set based on RDKit Descriptors"""

    # Compute/add all the Molecular Descriptors
    self.output_df = compute_molecular_descriptors(self.input_df)