Skip to content

Views

View Examples

Examples of using the Views classes to extend the functionality of SageWorks Artifacts are in the Examples section at the bottom of this page.

Views are a powerful way to filter and agument your DataSources and FeatureSets. With Views you can subset columns, rows, and even add data to existing SageWorks Artifacts. If you want to compute outliers, runs some statistics or engineer some new features, Views are an easy way to change, modify, and add to DataSources and FeatureSets.

View: Read from a view (training, display, etc) for DataSources and FeatureSets.

View

View: Read from a view (training, display, etc) for DataSources and FeatureSets.

Common Usage
view = View(DataSource/FeatureSet, "training")
training_df = view.pull_dataframe()

# You can also pull from the DS/FS directly
display_view = ds.view("display")
print(display_view.columns)

# Get a DataFrame for the computation view
comp_view = fs.view("computation")
comp_df = comp_view.pull_dataframe()

# Query the view with a custom SQL query
query = f"SELECT * FROM {comp_view.table} WHERE age > 30"
df = comp_view.query(query)

# Delete the view
comp_view.delete()
Source code in src/sageworks/core/views/view.py
class View:
    """View: Read from a view (training, display, etc) for DataSources and FeatureSets.

    Common Usage:
        ```
        view = View(DataSource/FeatureSet, "training")
        training_df = view.pull_dataframe()

        # You can also pull from the DS/FS directly
        display_view = ds.view("display")
        print(display_view.columns)

        # Get a DataFrame for the computation view
        comp_view = fs.view("computation")
        comp_df = comp_view.pull_dataframe()

        # Query the view with a custom SQL query
        query = f"SELECT * FROM {comp_view.table} WHERE age > 30"
        df = comp_view.query(query)

        # Delete the view
        comp_view.delete()
        ```
    """

    # Class attributes
    log = logging.getLogger("sageworks")
    meta = Meta()

    def __init__(self, artifact: Union[DataSource, FeatureSet], view_name: str, **kwargs):
        """View Constructor: Retrieve a View for the given artifact

        Args:
            artifact (Union[DataSource, FeatureSet]): A DataSource or FeatureSet object
            view_name (str): The name of the view to retrieve (e.g. "training")
        """

        # Set the view name
        self.view_name = view_name

        # Is this a DataSource or a FeatureSet?
        self.is_feature_set = isinstance(artifact, FeatureSetCore)
        self.auto_id_column = artifact.record_id if self.is_feature_set else None

        # Get the data_source from the artifact
        self.artifact_name = artifact.uuid
        self.data_source = artifact.data_source if self.is_feature_set else artifact
        self.database = self.data_source.get_database()

        # Construct our base_table_name
        self.base_table_name = self.data_source.table

        # Check if the view should be auto created
        self.auto_created = False
        if kwargs.get("auto_create_view", True) and not self.exists():

            # A direct double check before we auto-create
            if not self.exists(skip_cache=True):
                self.log.important(
                    f"View {self.view_name} for {self.artifact_name} doesn't exist, attempting to auto-create..."
                )
                self.auto_created = self._auto_create_view()

                # Check for failure of the auto-creation
                if not self.auto_created:
                    self.log.error(
                        f"View {self.view_name} for {self.artifact_name} doesn't exist and cannot be auto-created..."
                    )
                    self.view_name = self.columns = self.column_types = self.source_table = self.base_table_name = None
                    return

        # Now fill some details about the view
        self.columns, self.column_types, self.source_table, self.join_view = view_details(
            self.data_source.get_database(), self.table, self.data_source.boto3_session
        )

    def pull_dataframe(self, limit: int = 50000, head: bool = False) -> Union[pd.DataFrame, None]:
        """Pull a DataFrame based on the view type

        Args:
            limit (int): The maximum number of rows to pull (default: 50000)
            head (bool): Return just the head of the DataFrame (default: False)

        Returns:
            Union[pd.DataFrame, None]: The DataFrame for the view or None if it doesn't exist
        """

        # Pull the DataFrame
        if head:
            limit = 5
        pull_query = f"SELECT * FROM {self.table} LIMIT {limit}"
        df = self.data_source.query(pull_query)
        return df

    def query(self, query: str) -> Union[pd.DataFrame, None]:
        """Query the view with a custom SQL query

        Args:
            query (str): The SQL query to execute

        Returns:
            Union[pd.DataFrame, None]: The DataFrame for the query or None if it doesn't exist
        """
        return self.data_source.query(query)

    def column_details(self) -> dict:
        """Return a dictionary of the column names and types for this view

        Returns:
            dict: A dictionary of the column names and types
        """
        return dict(zip(self.columns, self.column_types))

    @property
    def table(self) -> str:
        """Construct the view table name for the given view type

        Returns:
            str: The view table name
        """
        if self.view_name is None:
            return None
        if self.view_name == "base":
            return self.base_table_name
        return f"{self.base_table_name}_{self.view_name}"

    def delete(self):
        """Delete the database view (and supplemental data) if it exists."""

        # List any supplemental tables for this data source
        supplemental_tables = list_supplemental_data_tables(self.data_source)
        for table in supplemental_tables:
            if self.view_name in table:
                self.log.important(f"Deleting Supplemental Table {table}...")
                delete_table(self.data_source, table)

        # Now drop the view
        self.log.important(f"Dropping View {self.table}...")
        drop_view_query = f"DROP VIEW {self.table}"

        # Execute the DROP VIEW query
        try:
            self.data_source.execute_statement(drop_view_query, silence_errors=True)
        except wr.exceptions.QueryFailed as e:
            if "View not found" in str(e):
                self.log.info(f"View {self.table} not found, this is fine...")
            else:
                raise

        # We want to do a small sleep so that AWS has time to catch up
        self.log.info("Sleeping for 3 seconds after dropping view to allow AWS to catch up...")
        time.sleep(3)

    def exists(self, skip_cache: bool = False) -> bool:
        """Check if the view exists in the database

        Args:
            skip_cache (bool): Skip the cache and check the database directly (default: False)
        Returns:
            bool: True if the view exists, False otherwise.
        """
        # The BaseView always exists
        if self.view_name == "base":
            return True

        # If we're skipping the cache, we need to check the database directly
        if skip_cache:
            return self._check_database()

        # Use the meta class to see if the view exists
        views_df = self.meta.views(self.database)

        # Check if we have ANY views
        if views_df.empty:
            return False

        # Check if the view exists
        return self.table in views_df["Name"].values

    def ensure_exists(self):
        """Ensure if the view exists by making a query directly to the database. If it doesn't exist, create it"""

        # The BaseView always exists
        if self.view_name == "base":
            return True

        # Check the database directly
        if not self._check_database():
            self._auto_create_view()

    def _check_database(self) -> bool:
        """Internal: Check if the view exists in the database

        Returns:
            bool: True if the view exists, False otherwise
        """
        # Query to check if the table/view exists
        check_table_query = f"""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = '{self.database}' AND table_name = '{self.table}'
        """
        _df = self.data_source.query(check_table_query)
        return not _df.empty

    def _auto_create_view(self) -> bool:
        """Internal: Automatically create a view training, display, and computation views

        Returns:
            bool: True if the view was created, False otherwise

        Raises:
            ValueError: If the view type is not supported
        """
        from sageworks.core.views import DisplayView, ComputationView, TrainingView

        # First if we're going to auto-create, we need to make sure the data source exists
        if not self.data_source.exists():
            self.log.error(f"Data Source {self.data_source.uuid} does not exist...")
            return False

        # DisplayView
        if self.view_name == "display":
            self.log.important(f"Auto creating View {self.view_name} for {self.data_source.uuid}...")
            DisplayView.create(self.data_source)
            return True

        # ComputationView
        if self.view_name == "computation":
            self.log.important(f"Auto creating View {self.view_name} for {self.data_source.uuid}...")
            ComputationView.create(self.data_source)
            return True

        # TrainingView
        if self.view_name == "training":
            # We're only going to create training views for FeatureSets
            if self.is_feature_set:
                self.log.important(f"Auto creating View {self.view_name} for {self.data_source.uuid}...")
                TrainingView.create(self.data_source, id_column=self.auto_id_column)
                return True
            else:
                self.log.warning("Training Views are only supported for FeatureSets...")
                return False

        # If we get here, we don't support auto-creating this view
        self.log.warning(f"Auto-Create for {self.view_name} not implemented yet...")
        return False

    def __repr__(self):
        """Return a string representation of this object"""

        # Set up various details that we want to print out
        auto = "(Auto-Created)" if self.auto_created else ""
        artifact = "FeatureSet" if self.is_feature_set else "DataSource"

        info = f'View: "{self.view_name}" for {artifact}("{self.artifact_name}")\n'
        info += f"      Database: {self.database}\n"
        info += f"      Table: {self.table}{auto}\n"
        info += f"      Source Table: {self.source_table}\n"
        info += f"      Join View: {self.join_view}"
        return info

table: str property

Construct the view table name for the given view type

Returns:

Name Type Description
str str

The view table name

__init__(artifact, view_name, **kwargs)

View Constructor: Retrieve a View for the given artifact

Parameters:

Name Type Description Default
artifact Union[DataSource, FeatureSet]

A DataSource or FeatureSet object

required
view_name str

The name of the view to retrieve (e.g. "training")

required
Source code in src/sageworks/core/views/view.py
def __init__(self, artifact: Union[DataSource, FeatureSet], view_name: str, **kwargs):
    """View Constructor: Retrieve a View for the given artifact

    Args:
        artifact (Union[DataSource, FeatureSet]): A DataSource or FeatureSet object
        view_name (str): The name of the view to retrieve (e.g. "training")
    """

    # Set the view name
    self.view_name = view_name

    # Is this a DataSource or a FeatureSet?
    self.is_feature_set = isinstance(artifact, FeatureSetCore)
    self.auto_id_column = artifact.record_id if self.is_feature_set else None

    # Get the data_source from the artifact
    self.artifact_name = artifact.uuid
    self.data_source = artifact.data_source if self.is_feature_set else artifact
    self.database = self.data_source.get_database()

    # Construct our base_table_name
    self.base_table_name = self.data_source.table

    # Check if the view should be auto created
    self.auto_created = False
    if kwargs.get("auto_create_view", True) and not self.exists():

        # A direct double check before we auto-create
        if not self.exists(skip_cache=True):
            self.log.important(
                f"View {self.view_name} for {self.artifact_name} doesn't exist, attempting to auto-create..."
            )
            self.auto_created = self._auto_create_view()

            # Check for failure of the auto-creation
            if not self.auto_created:
                self.log.error(
                    f"View {self.view_name} for {self.artifact_name} doesn't exist and cannot be auto-created..."
                )
                self.view_name = self.columns = self.column_types = self.source_table = self.base_table_name = None
                return

    # Now fill some details about the view
    self.columns, self.column_types, self.source_table, self.join_view = view_details(
        self.data_source.get_database(), self.table, self.data_source.boto3_session
    )

__repr__()

Return a string representation of this object

Source code in src/sageworks/core/views/view.py
def __repr__(self):
    """Return a string representation of this object"""

    # Set up various details that we want to print out
    auto = "(Auto-Created)" if self.auto_created else ""
    artifact = "FeatureSet" if self.is_feature_set else "DataSource"

    info = f'View: "{self.view_name}" for {artifact}("{self.artifact_name}")\n'
    info += f"      Database: {self.database}\n"
    info += f"      Table: {self.table}{auto}\n"
    info += f"      Source Table: {self.source_table}\n"
    info += f"      Join View: {self.join_view}"
    return info

column_details()

Return a dictionary of the column names and types for this view

Returns:

Name Type Description
dict dict

A dictionary of the column names and types

Source code in src/sageworks/core/views/view.py
def column_details(self) -> dict:
    """Return a dictionary of the column names and types for this view

    Returns:
        dict: A dictionary of the column names and types
    """
    return dict(zip(self.columns, self.column_types))

delete()

Delete the database view (and supplemental data) if it exists.

Source code in src/sageworks/core/views/view.py
def delete(self):
    """Delete the database view (and supplemental data) if it exists."""

    # List any supplemental tables for this data source
    supplemental_tables = list_supplemental_data_tables(self.data_source)
    for table in supplemental_tables:
        if self.view_name in table:
            self.log.important(f"Deleting Supplemental Table {table}...")
            delete_table(self.data_source, table)

    # Now drop the view
    self.log.important(f"Dropping View {self.table}...")
    drop_view_query = f"DROP VIEW {self.table}"

    # Execute the DROP VIEW query
    try:
        self.data_source.execute_statement(drop_view_query, silence_errors=True)
    except wr.exceptions.QueryFailed as e:
        if "View not found" in str(e):
            self.log.info(f"View {self.table} not found, this is fine...")
        else:
            raise

    # We want to do a small sleep so that AWS has time to catch up
    self.log.info("Sleeping for 3 seconds after dropping view to allow AWS to catch up...")
    time.sleep(3)

ensure_exists()

Ensure if the view exists by making a query directly to the database. If it doesn't exist, create it

Source code in src/sageworks/core/views/view.py
def ensure_exists(self):
    """Ensure if the view exists by making a query directly to the database. If it doesn't exist, create it"""

    # The BaseView always exists
    if self.view_name == "base":
        return True

    # Check the database directly
    if not self._check_database():
        self._auto_create_view()

exists(skip_cache=False)

Check if the view exists in the database

Parameters:

Name Type Description Default
skip_cache bool

Skip the cache and check the database directly (default: False)

False

Returns: bool: True if the view exists, False otherwise.

Source code in src/sageworks/core/views/view.py
def exists(self, skip_cache: bool = False) -> bool:
    """Check if the view exists in the database

    Args:
        skip_cache (bool): Skip the cache and check the database directly (default: False)
    Returns:
        bool: True if the view exists, False otherwise.
    """
    # The BaseView always exists
    if self.view_name == "base":
        return True

    # If we're skipping the cache, we need to check the database directly
    if skip_cache:
        return self._check_database()

    # Use the meta class to see if the view exists
    views_df = self.meta.views(self.database)

    # Check if we have ANY views
    if views_df.empty:
        return False

    # Check if the view exists
    return self.table in views_df["Name"].values

pull_dataframe(limit=50000, head=False)

Pull a DataFrame based on the view type

Parameters:

Name Type Description Default
limit int

The maximum number of rows to pull (default: 50000)

50000
head bool

Return just the head of the DataFrame (default: False)

False

Returns:

Type Description
Union[DataFrame, None]

Union[pd.DataFrame, None]: The DataFrame for the view or None if it doesn't exist

Source code in src/sageworks/core/views/view.py
def pull_dataframe(self, limit: int = 50000, head: bool = False) -> Union[pd.DataFrame, None]:
    """Pull a DataFrame based on the view type

    Args:
        limit (int): The maximum number of rows to pull (default: 50000)
        head (bool): Return just the head of the DataFrame (default: False)

    Returns:
        Union[pd.DataFrame, None]: The DataFrame for the view or None if it doesn't exist
    """

    # Pull the DataFrame
    if head:
        limit = 5
    pull_query = f"SELECT * FROM {self.table} LIMIT {limit}"
    df = self.data_source.query(pull_query)
    return df

query(query)

Query the view with a custom SQL query

Parameters:

Name Type Description Default
query str

The SQL query to execute

required

Returns:

Type Description
Union[DataFrame, None]

Union[pd.DataFrame, None]: The DataFrame for the query or None if it doesn't exist

Source code in src/sageworks/core/views/view.py
def query(self, query: str) -> Union[pd.DataFrame, None]:
    """Query the view with a custom SQL query

    Args:
        query (str): The SQL query to execute

    Returns:
        Union[pd.DataFrame, None]: The DataFrame for the query or None if it doesn't exist
    """
    return self.data_source.query(query)

Examples

All of the SageWorks Examples are in the Sageworks Repository under the examples/ directory. For a full code listing of any example please visit our SageWorks Examples

Listing Views

views.py
from sageworks.api.data_source import DataSource

# Convert the Data Source to a Feature Set
test_data = DataSource('test_data')
test_data.views()
["display", "training", "computation"]

Getting a Particular View

views.py
from sageworks.api.feature_set import FeatureSet

fs = FeatureSet('test_features')

# Grab the columns for the display view
display_view = fs.view("display")
display_view.columns
['id', 'name', 'height', 'weight', 'salary', ...]

# Pull the dataframe for this view
df = display_view.pull_dataframe()
    id       name     height      weight         salary ...
0   58  Person 58  71.781227  275.088196  162053.140625  

View Queries

All SageWorks Views are stored in AWS Athena, so any query that you can make with Athena is accessible through the View Query API.

view_query.py
from sageworks.api.feature_set import FeatureSet

# Grab a FeatureSet View
fs = FeatureSet("abalone_features")
t_view = fs.view("training")

# Make some queries using the Athena backend
df = t_view(f"select * from {t_view.table} where height > .3")
print(df.head())

df = t_view.query("select * from abalone_features where class_number_of_rings < 3")
print(df.head())

Output

  sex  length  diameter  height  whole_weight  shucked_weight  viscera_weight  shell_weight  class_number_of_rings
0   M   0.705     0.565   0.515         2.210          1.1075          0.4865        0.5120                     10
1   F   0.455     0.355   1.130         0.594          0.3320          0.1160        0.1335                      8

  sex  length  diameter  height  whole_weight  shucked_weight  viscera_weight  shell_weight  class_number_of_rings
0   I   0.075     0.055   0.010         0.002          0.0010          0.0005        0.0015                      1
1   I   0.150     0.100   0.025         0.015          0.0045          0.0040         0.0050                      2

Classes to construct View

The SageWorks Classes used to construct viewss are currently in 'Core'. So you can check out the documentation for those classes here: SageWorks View Creators

The SuperCowPowers team is happy to anser any questions you may have about AWS and SageWorks. Please contact us at sageworks@supercowpowers.com or on chat us up on Discord