Skip to content

Meta

Meta Examples

Examples of using the Meta class are listed at the bottom of this page: see Examples.

Meta: A class that provides high-level information and summaries of SageWorks/AWS Artifacts. The Meta class provides 'meta' information: which account we are in, what the current configuration is, etc. It also provides metadata for AWS Artifacts, such as Data Sources, Feature Sets, Models, and Endpoints.

Meta

Meta: A class that provides Metadata for a broad set of AWS Artifacts

Common Usage:

meta = Meta()
meta.account()
meta.config()
meta.data_sources()

Source code in src/sageworks/api/meta.py
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
class Meta:
    """Meta: A class that provides Metadata for a broad set of AWS Artifacts

    Common Usage:
    ```
    meta = Meta()
    meta.account()
    meta.config()
    meta.data_sources()
    ```
    """

    def __init__(self):
        """Meta Initialization"""
        self.log = logging.getLogger("sageworks")

        # Account and Service Brokers
        self.aws_account_clamp = AWSAccountClamp()
        self.aws_broker = AWSServiceBroker()
        self.cm = ConfigManager()

        # Pipeline Manager
        self.pipeline_manager = PipelineManager()

    def account(self) -> dict:
        """Return the AWS Account Info

        Returns:
            dict: The AWS Account Info
        """
        return self.aws_account_clamp.get_aws_account_info()

    def config(self) -> dict:
        """Return the current SageWorks Configuration

        Returns:
            dict: The current SageWorks Configuration
        """
        return self.cm.get_all_config()

    def incoming_data(self) -> pd.DataFrame:
        """Get summary data about data in the incoming-data S3 Bucket

        Returns:
            pd.DataFrame: A summary of the data in the incoming-data S3 Bucket
        """
        data = self.incoming_data_deep()
        data_summary = []
        for s3_path, info in data.items():
            # Keep the last two path components and strip the 'incoming-data/' text
            name = "/".join(s3_path.split("/")[-2:]).replace("incoming-data/", "")

            # The shortened name is stored on the metadata before it is handed to aws_url()
            info["Name"] = name

            # A missing (or None) ContentLength is reported as 0 MB instead of raising TypeError
            size = (info.get("ContentLength") or 0) / 1_000_000
            summary = {
                "Name": name,
                "Size(MB)": f"{size:.2f}",
                "Modified": datetime_string(info.get("LastModified", "-")),
                "ContentType": str(info.get("ContentType", "-")),
                "ServerSideEncryption": info.get("ServerSideEncryption", "-"),
                "Tags": str(info.get("tags", "-")),
                "_aws_url": aws_url(info, "S3", self.aws_account_clamp),  # Hidden Column
            }
            data_summary.append(summary)

        # Return the summary
        return pd.DataFrame(data_summary)

    def incoming_data_deep(self, refresh: bool = False) -> dict:
        """Get a deeper set of data for the Incoming Data in AWS

        Args:
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            dict: A summary of the Incoming Data in AWS
        """
        return self.aws_broker.get_metadata(ServiceCategory.INCOMING_DATA_S3, force_refresh=refresh)

    def glue_jobs(self) -> pd.DataFrame:
        """Get summary data about AWS Glue Jobs

        Returns:
            pd.DataFrame: A summary of the AWS Glue Jobs
        """
        glue_meta = self.glue_jobs_deep()
        glue_summary = []

        # Get the information about each Glue Job
        for info in glue_meta.values():
            summary = {
                "Name": info["Name"],
                "GlueVersion": info["GlueVersion"],
                "Workers": info.get("NumberOfWorkers", "-"),
                "WorkerType": info.get("WorkerType", "-"),
                "Modified": datetime_string(info.get("LastModifiedOn")),
                "LastRun": datetime_string(info["sageworks_meta"]["last_run"]),
                "Status": info["sageworks_meta"]["status"],
                "_aws_url": aws_url(info, "GlueJob", self.aws_account_clamp),  # Hidden Column
            }
            glue_summary.append(summary)

        # Return the summary
        return pd.DataFrame(glue_summary)

    def glue_jobs_deep(self, refresh: bool = False) -> dict:
        """Get a deeper set of data for the Glue Jobs in AWS

        Args:
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            dict: A summary of the Glue Jobs in AWS
        """
        return self.aws_broker.get_metadata(ServiceCategory.GLUE_JOBS, force_refresh=refresh)

    def data_sources(self) -> pd.DataFrame:
        """Get a summary of the Data Sources in AWS

        Returns:
            pd.DataFrame: A summary of the Data Sources in AWS
        """
        data = self.data_sources_deep()
        data_summary = []

        # Pull in various bits of metadata for each data source
        for name, info in data.items():
            parameters = info.get("Parameters", {})
            summary = {
                "Name": name,
                "Modified": datetime_string(info.get("UpdateTime")),
                "Num Columns": num_columns_ds(info),
                "Tags": parameters.get("sageworks_tags", "-"),
                "Input": str(parameters.get("sageworks_input", "-")),
                "_aws_url": aws_url(info, "DataSource", self.aws_account_clamp),  # Hidden Column
            }
            data_summary.append(summary)

        # Return the summary
        return pd.DataFrame(data_summary)

    def data_source_details(
        self, data_source_name: str, database: str = "sageworks", refresh: bool = False
    ) -> Union[dict, None]:
        """Get detailed information about a specific data source in AWS

        Args:
            data_source_name (str): The name of the data source
            database (str, optional): Glue database. Defaults to 'sageworks'.
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            dict: Detailed information about the data source (or None if not found)
        """
        data = self.data_sources_deep(database=database, refresh=refresh)
        return data.get(data_source_name)

    def data_sources_deep(self, database: str = "sageworks", refresh: bool = False) -> dict:
        """Get a deeper set of data for the Data Sources in AWS

        Args:
            database (str, optional): Glue database. Defaults to 'sageworks'.
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            dict: Detailed information about all the Data Sources in AWS

        Raises:
            KeyError: If `database` is not a key of the data catalog metadata
        """
        if refresh:
            self.log.monitor("DataSources Deep Dive Force Refresh...")
        data = self.aws_broker.get_metadata(ServiceCategory.DATA_CATALOG, force_refresh=refresh)

        # Data Sources are in two databases, 'sageworks' and 'sagemaker_featurestore'
        return data[database]

    def views(self, database: str = "sageworks") -> pd.DataFrame:
        """Get a summary of the all the Views, for the given database, in AWS

        Args:
            database (str, optional): Glue database. Defaults to 'sageworks'.

        Returns:
            pd.DataFrame: A summary of all the Views, for the given database, in AWS
        """
        view_data = self.views_deep(database=database)
        view_summary = []

        # Pull in various bits of metadata for each view
        for name, info in view_data.items():
            summary = {
                "Name": name,
                "Modified": datetime_string(info.get("UpdateTime")),
                "Num Columns": num_columns_ds(info),
            }
            view_summary.append(summary)

        # Return the summary
        return pd.DataFrame(view_summary)

    def view_details(self, view_name: str, database: str = "sageworks", refresh: bool = False) -> Union[dict, None]:
        """Get detailed information about a specific View in AWS

        Args:
            view_name (str): The name of the View
            database (str, optional): Glue database. Defaults to 'sageworks'.
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            dict: Detailed information about the view (or None if not found)
        """
        data = self.views_deep(database=database, refresh=refresh)
        return data.get(view_name)

    def views_deep(self, database: str = "sageworks", refresh: bool = False) -> dict:
        """Get a deeper set of data for the Views in Athena/AWS

        Args:
            database (str, optional): Glue database. Defaults to 'sageworks'.
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            dict: Detailed information about all the Views in AWS (empty if the
                metadata has no 'views' entry)

        Raises:
            KeyError: If the 'views' entry exists but has no `database` key
        """
        if refresh:
            self.log.monitor("Views Deep Dive Force Refresh...")
        data = self.aws_broker.get_metadata(ServiceCategory.DATA_CATALOG, force_refresh=refresh)

        # Views live under the 'views' key of the data catalog, keyed by database
        if "views" not in data:
            self.log.warning("No views found in the metadata")
            return {}
        return data["views"][database]

    def feature_sets(self, refresh: bool = False) -> pd.DataFrame:
        """Get a summary of the Feature Sets in AWS

        Args:
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            pd.DataFrame: A summary of the Feature Sets in AWS
        """
        data = self.feature_sets_deep(refresh)
        data_summary = []

        # Pull in various bits of metadata for each feature set
        for group_info in data.values():
            sageworks_meta = group_info.get("sageworks_meta", {})
            summary = {
                "Feature Group": group_info["FeatureGroupName"],
                "Created": datetime_string(group_info.get("CreationTime")),
                "Num Columns": num_columns_fs(group_info),
                "Input": sageworks_meta.get("sageworks_input", "-"),
                "Tags": sageworks_meta.get("sageworks_tags", "-"),
                "Online": str(group_info.get("OnlineStoreConfig", {}).get("EnableOnlineStore", "False")),
                "_aws_url": aws_url(group_info, "FeatureSet", self.aws_account_clamp),  # Hidden Column
            }
            data_summary.append(summary)

        # Return the summary
        return pd.DataFrame(data_summary)

    def feature_set_details(self, feature_set_name: str) -> dict:
        """Get detailed information about a specific feature set in AWS

        Args:
            feature_set_name (str): The name of the feature set

        Returns:
            dict: Detailed information about the feature set (empty dict if not found)
        """
        data = self.feature_sets_deep()
        return data.get(feature_set_name, {})

    def feature_sets_deep(self, refresh: bool = False) -> dict:
        """Get a deeper set of data for the Feature Sets in AWS

        Args:
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            dict: A summary of the Feature Sets in AWS
        """
        if refresh:
            self.log.monitor("FeatureSets Deep Dive Force Refresh...")
        return self.aws_broker.get_metadata(ServiceCategory.FEATURE_STORE, force_refresh=refresh)

    def models(self, refresh: bool = False) -> pd.DataFrame:
        """Get a summary of the Models in AWS

        Args:
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            pd.DataFrame: A summary of the Models in AWS
        """
        model_data = self.models_deep(refresh)
        model_summary = []
        for model_group_name, model_list in model_data.items():

            # Sanity check for empty Model Package Groups
            if not model_list:
                self.log.error(f"No models found in {model_group_name}")
                summary = {"Model Group": model_group_name, "Health": "failed", "Owner": "empty", "Model Type": "empty"}
                summary.update({key: "-" for key in ["Created", "Ver", "Tags", "Input", "Status", "Description"]})
                model_summary.append(summary)
                continue

            # Get Summary information for the 'latest' model in the model_list
            latest_model = model_list[0]
            sageworks_meta = latest_model.get("sageworks_meta", {})

            # If the sageworks_health_tags have nothing in them, then the model is healthy
            health_tags = sageworks_meta.get("sageworks_health_tags", "-")
            health_tags = health_tags if health_tags else "healthy"
            summary = {
                "Model Group": latest_model["ModelPackageGroupName"],
                "Health": health_tags,
                "Owner": sageworks_meta.get("sageworks_owner", "-"),
                "Model Type": sageworks_meta.get("sageworks_model_type"),
                "Created": datetime_string(latest_model.get("CreationTime")),
                "Ver": latest_model["ModelPackageVersion"],
                "Tags": sageworks_meta.get("sageworks_tags", "-"),
                "Input": sageworks_meta.get("sageworks_input", "-"),
                "Status": latest_model["ModelPackageStatus"],
                "Description": latest_model.get("ModelPackageDescription", "-"),
            }
            model_summary.append(summary)

        # Return the summary
        return pd.DataFrame(model_summary)

    def model_details(self, model_group_name: str) -> dict:
        """Get detailed information about a specific model group in AWS

        Args:
            model_group_name (str): The name of the model group

        Returns:
            dict: Detailed information about the model group
        """
        data = self.models_deep()
        return data.get(model_group_name, {})

    def models_deep(self, refresh: bool = False) -> dict:
        """Get a deeper set of data for Models in AWS

        Args:
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            dict: A summary of the Models in AWS
        """
        if refresh:
            self.log.monitor("Models Deep Dive Force Refresh...")
        return self.aws_broker.get_metadata(ServiceCategory.MODELS, force_refresh=refresh)

    def endpoints(self, refresh: bool = False) -> pd.DataFrame:
        """Get a summary of the Endpoints in AWS

        Args:
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            pd.DataFrame: A summary of the Endpoints in AWS
        """
        data = self.endpoints_deep(refresh)
        data_summary = []

        # Get Summary information for each endpoint
        for endpoint_info in data.values():
            # Get the SageWorks metadata for this Endpoint
            sageworks_meta = endpoint_info.get("sageworks_meta", {})

            # If the sageworks_health_tags have nothing in them, then the endpoint is healthy
            health_tags = sageworks_meta.get("sageworks_health_tags", "-")
            health_tags = health_tags if health_tags else "healthy"

            # A missing, None, or empty ProductionVariants list falls back to a
            # single empty variant so the [0] lookup cannot raise IndexError
            variants = endpoint_info.get("ProductionVariants") or [{}]
            capture_config = endpoint_info.get("DataCaptureConfig", {})
            summary = {
                "Name": endpoint_info["EndpointName"],
                "Health": health_tags,
                "Instance": endpoint_info.get("InstanceType", "-"),
                "Created": datetime_string(endpoint_info.get("CreationTime")),
                "Tags": sageworks_meta.get("sageworks_tags", "-"),
                "Input": sageworks_meta.get("sageworks_input", "-"),
                "Status": endpoint_info["EndpointStatus"],
                "Variant": variants[0].get("VariantName", "-"),
                "Capture": str(capture_config.get("EnableCapture", "False")),
                "Samp(%)": str(capture_config.get("CurrentSamplingPercentage", "-")),
            }
            data_summary.append(summary)

        # Return the summary
        return pd.DataFrame(data_summary)

    def endpoints_deep(self, refresh: bool = False) -> dict:
        """Get a deeper set of data for Endpoints in AWS

        Args:
            refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

        Returns:
            dict: A summary of the Endpoints in AWS
        """
        if refresh:
            self.log.monitor("Endpoints Deep Dive Force Refresh...")
        return self.aws_broker.get_metadata(ServiceCategory.ENDPOINTS, force_refresh=refresh)

    def pipelines(self, refresh: bool = False) -> pd.DataFrame:
        """Get a summary of the SageWorks Pipelines

        Args:
            refresh (bool, optional): Accepted for signature parity with the other
                summary methods; it is not used by this method. Defaults to False.

        Returns:
            pd.DataFrame: A summary of the SageWorks Pipelines
        """
        data = self.pipeline_manager.list_pipelines()

        # Return the pipelines summary as a DataFrame
        return pd.DataFrame(data)

    def _remove_sageworks_meta(self, data: dict) -> dict:
        """Internal: Recursively remove any keys that start with 'sageworks_'

        Args:
            data (dict): The (possibly nested) dictionary to filter

        Returns:
            dict: A new dictionary without any 'sageworks_' keys, at any nesting level
        """
        summary_data = {}
        for key, value in data.items():
            # Check the key first so dict-valued entries (e.g. 'sageworks_meta')
            # are dropped too, instead of being recursed into and kept
            if isinstance(key, str) and key.startswith("sageworks_"):
                continue
            if isinstance(value, dict):
                value = self._remove_sageworks_meta(value)
            summary_data[key] = value
        return summary_data

    def refresh_all_aws_meta(self) -> None:
        """Force a refresh of all the metadata"""
        self.aws_broker.get_all_metadata(force_refresh=True)

__init__()

Meta Initialization

Source code in src/sageworks/api/meta.py
def __init__(self) -> None:
    """Meta Initialization

    Grabs the "sageworks" logger and creates the helper objects that the
    other methods read from: the account clamp, the service broker, the
    config manager, and the pipeline manager.
    """
    self.log = logging.getLogger("sageworks")

    # Account and Service Brokers
    self.aws_account_clamp = AWSAccountClamp()
    self.aws_broker = AWSServiceBroker()
    self.cm = ConfigManager()

    # Pipeline Manager
    self.pipeline_manager = PipelineManager()

account()

Return the AWS Account Info

Returns:

Name Type Description
dict dict

The AWS Account Info

Source code in src/sageworks/api/meta.py
def account(self) -> dict:
    """Fetch the AWS Account Info from the account clamp

    Returns:
        dict: The AWS Account Info
    """
    account_info = self.aws_account_clamp.get_aws_account_info()
    return account_info

config()

Return the current SageWorks Configuration

Returns:

Name Type Description
dict dict

The current SageWorks Configuration

Source code in src/sageworks/api/meta.py
def config(self) -> dict:
    """Fetch the full SageWorks Configuration from the config manager

    Returns:
        dict: The current SageWorks Configuration
    """
    current_config = self.cm.get_all_config()
    return current_config

data_source_details(data_source_name, database='sageworks', refresh=False)

Get detailed information about a specific data source in AWS

Parameters:

Name Type Description Default
data_source_name str

The name of the data source

required
database str

Glue database. Defaults to 'sageworks'.

'sageworks'
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Name Type Description
dict Union[dict, None]

Detailed information about the data source (or None if not found)

Source code in src/sageworks/api/meta.py
def data_source_details(
    self, data_source_name: str, database: str = "sageworks", refresh: bool = False
) -> Union[dict, None]:
    """Look up the detailed metadata for a single data source

    Args:
        data_source_name (str): The name of the data source
        database (str, optional): Glue database. Defaults to 'sageworks'.
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        Union[dict, None]: The entry for the data source, or None when the name is not present
    """
    all_sources = self.data_sources_deep(database=database, refresh=refresh)
    if data_source_name in all_sources:
        return all_sources[data_source_name]
    return None

data_sources()

Get a summary of the Data Sources in AWS

Returns:

Type Description
DataFrame

pd.DataFrame: A summary of the Data Sources in AWS

Source code in src/sageworks/api/meta.py
def data_sources(self) -> pd.DataFrame:
    """Build a summary table with one row per Data Source

    Returns:
        pd.DataFrame: A summary of the Data Sources in AWS
    """

    def _summarize(ds_name: str, ds_info: dict) -> dict:
        # One summary row; key order here is the column order of the DataFrame
        return {
            "Name": ds_name,
            "Modified": datetime_string(ds_info.get("UpdateTime")),
            "Num Columns": num_columns_ds(ds_info),
            "Tags": ds_info.get("Parameters", {}).get("sageworks_tags", "-"),
            "Input": str(ds_info.get("Parameters", {}).get("sageworks_input", "-")),
            "_aws_url": aws_url(ds_info, "DataSource", self.aws_account_clamp),  # Hidden Column
        }

    rows = [_summarize(ds_name, ds_info) for ds_name, ds_info in self.data_sources_deep().items()]
    return pd.DataFrame(rows)

data_sources_deep(database='sageworks', refresh=False)

Get a deeper set of data for the Data Sources in AWS

Parameters:

Name Type Description Default
database str

Glue database. Defaults to 'sageworks'.

'sageworks'
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Name Type Description
dict dict

Detailed information about all the Data Sources in AWS

Source code in src/sageworks/api/meta.py
def data_sources_deep(self, database: str = "sageworks", refresh: bool = False) -> dict:
    """Fetch the data catalog metadata and return the entry for one database

    Args:
        database (str, optional): Glue database. Defaults to 'sageworks'.
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        dict: Detailed information about all the Data Sources in AWS
    """
    if refresh:
        self.log.monitor("DataSources Deep Dive Force Refresh...")
    catalog = self.aws_broker.get_metadata(ServiceCategory.DATA_CATALOG, force_refresh=refresh)

    # The catalog is keyed by database ('sageworks' and 'sagemaker_featurestore')
    return catalog[database]

endpoints(refresh=False)

Get a summary of the Endpoints in AWS

Parameters:

Name Type Description Default
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Type Description
DataFrame

pd.DataFrame: A summary of the Endpoints in AWS

Source code in src/sageworks/api/meta.py
def endpoints(self, refresh: bool = False) -> pd.DataFrame:
    """Get a summary of the Endpoints in AWS

    Args:
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        pd.DataFrame: A summary of the Endpoints in AWS
    """
    data = self.endpoints_deep(refresh)
    data_summary = []

    # Get Summary information for each endpoint
    for endpoint_info in data.values():
        # Get the SageWorks metadata for this Endpoint
        sageworks_meta = endpoint_info.get("sageworks_meta", {})

        # If the sageworks_health_tags have nothing in them, then the endpoint is healthy
        health_tags = sageworks_meta.get("sageworks_health_tags", "-")
        health_tags = health_tags if health_tags else "healthy"

        # A missing, None, or empty ProductionVariants list falls back to a
        # single empty variant so the [0] lookup cannot raise IndexError
        variants = endpoint_info.get("ProductionVariants") or [{}]
        capture_config = endpoint_info.get("DataCaptureConfig", {})
        summary = {
            "Name": endpoint_info["EndpointName"],
            "Health": health_tags,
            "Instance": endpoint_info.get("InstanceType", "-"),
            "Created": datetime_string(endpoint_info.get("CreationTime")),
            "Tags": sageworks_meta.get("sageworks_tags", "-"),
            "Input": sageworks_meta.get("sageworks_input", "-"),
            "Status": endpoint_info["EndpointStatus"],
            "Variant": variants[0].get("VariantName", "-"),
            "Capture": str(capture_config.get("EnableCapture", "False")),
            "Samp(%)": str(capture_config.get("CurrentSamplingPercentage", "-")),
        }
        data_summary.append(summary)

    # Return the summary
    return pd.DataFrame(data_summary)

endpoints_deep(refresh=False)

Get a deeper set of data for Endpoints in AWS

Parameters:

Name Type Description Default
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Name Type Description
dict dict

A summary of the Endpoints in AWS

Source code in src/sageworks/api/meta.py
def endpoints_deep(self, refresh: bool = False) -> dict:
    """Fetch the detailed Endpoint metadata from the AWS service broker

    Args:
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        dict: A summary of the Endpoints in AWS
    """
    if refresh:
        self.log.monitor("Endpoints Deep Dive Force Refresh...")
    broker = self.aws_broker
    endpoint_meta = broker.get_metadata(ServiceCategory.ENDPOINTS, force_refresh=refresh)
    return endpoint_meta

feature_set_details(feature_set_name)

Get detailed information about a specific feature set in AWS

Parameters:

Name Type Description Default
feature_set_name str

The name of the feature set

required

Returns:

Name Type Description
dict dict

Detailed information about the feature set

Source code in src/sageworks/api/meta.py
def feature_set_details(self, feature_set_name: str) -> dict:
    """Look up the detailed metadata for a single feature set

    Args:
        feature_set_name (str): The name of the feature set

    Returns:
        dict: The entry for the feature set, or an empty dict when the name is not present
    """
    all_feature_sets = self.feature_sets_deep()
    if feature_set_name in all_feature_sets:
        return all_feature_sets[feature_set_name]
    return {}

feature_sets(refresh=False)

Get a summary of the Feature Sets in AWS

Parameters:

Name Type Description Default
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Type Description
DataFrame

pd.DataFrame: A summary of the Feature Sets in AWS

Source code in src/sageworks/api/meta.py
def feature_sets(self, refresh: bool = False) -> pd.DataFrame:
    """Build a summary table with one row per Feature Set

    Args:
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        pd.DataFrame: A summary of the Feature Sets in AWS
    """

    def _summarize(fg_info: dict) -> dict:
        # One summary row; key order here is the column order of the DataFrame
        meta = fg_info.get("sageworks_meta", {})
        return {
            "Feature Group": fg_info["FeatureGroupName"],
            "Created": datetime_string(fg_info.get("CreationTime")),
            "Num Columns": num_columns_fs(fg_info),
            "Input": meta.get("sageworks_input", "-"),
            "Tags": meta.get("sageworks_tags", "-"),
            "Online": str(fg_info.get("OnlineStoreConfig", {}).get("EnableOnlineStore", "False")),
            "_aws_url": aws_url(fg_info, "FeatureSet", self.aws_account_clamp),  # Hidden Column
        }

    feature_groups = self.feature_sets_deep(refresh)
    return pd.DataFrame([_summarize(fg_info) for fg_info in feature_groups.values()])

feature_sets_deep(refresh=False)

Get a deeper set of data for the Feature Sets in AWS

Parameters:

Name Type Description Default
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Name Type Description
dict dict

A summary of the Feature Sets in AWS

Source code in src/sageworks/api/meta.py
def feature_sets_deep(self, refresh: bool = False) -> dict:
    """Fetch the detailed Feature Set metadata from the AWS service broker

    Args:
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        dict: A summary of the Feature Sets in AWS
    """
    if refresh:
        self.log.monitor("FeatureSets Deep Dive Force Refresh...")
    broker = self.aws_broker
    feature_store_meta = broker.get_metadata(ServiceCategory.FEATURE_STORE, force_refresh=refresh)
    return feature_store_meta

glue_jobs()

Get summary data about AWS Glue Jobs

Source code in src/sageworks/api/meta.py
def glue_jobs(self) -> pd.DataFrame:
    """Build a summary table with one row per AWS Glue Job

    Returns:
        pd.DataFrame: A summary of the AWS Glue Jobs
    """

    def _summarize(job: dict) -> dict:
        # "Name", "GlueVersion" and "sageworks_meta" are read with [] and so
        # must be present; the remaining fields fall back to defaults
        return {
            "Name": job["Name"],
            "GlueVersion": job["GlueVersion"],
            "Workers": job.get("NumberOfWorkers", "-"),
            "WorkerType": job.get("WorkerType", "-"),
            "Modified": datetime_string(job.get("LastModifiedOn")),
            "LastRun": datetime_string(job["sageworks_meta"]["last_run"]),
            "Status": job["sageworks_meta"]["status"],
            "_aws_url": aws_url(job, "GlueJob", self.aws_account_clamp),  # Hidden Column
        }

    jobs = self.glue_jobs_deep()
    return pd.DataFrame([_summarize(job) for job in jobs.values()])

glue_jobs_deep(refresh=False)

Get a deeper set of data for the Glue Jobs in AWS

Parameters:

Name Type Description Default
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Name Type Description
dict dict

A summary of the Glue Jobs in AWS

Source code in src/sageworks/api/meta.py
def glue_jobs_deep(self, refresh: bool = False) -> dict:
    """Fetch the detailed Glue Job metadata from the AWS service broker

    Args:
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        dict: A summary of the Glue Jobs in AWS
    """
    broker = self.aws_broker
    glue_meta = broker.get_metadata(ServiceCategory.GLUE_JOBS, force_refresh=refresh)
    return glue_meta

incoming_data()

Get summary data about data in the incoming-data S3 Bucket

Returns:

Type Description
DataFrame

pd.DataFrame: A summary of the data in the incoming-data S3 Bucket

Source code in src/sageworks/api/meta.py
def incoming_data(self) -> pd.DataFrame:
    """Get summary data about data in the incoming-data S3 Bucket

    Returns:
        pd.DataFrame: A summary of the data in the incoming-data S3 Bucket
    """
    data = self.incoming_data_deep()
    data_summary = []
    for name, info in data.items():
        # Keep the last two path components and drop the 'incoming-data/' prefix
        name = "/".join(name.split("/")[-2:]).replace("incoming-data/", "")

        # The shortened name is written onto the metadata entry before it is handed to aws_url()
        info["Name"] = name

        # ContentLength may be absent: show "-" (like the other columns) instead of
        # raising a TypeError on None / 1_000_000
        content_length = info.get("ContentLength")
        size_mb = "-" if content_length is None else f"{content_length / 1_000_000:.2f}"

        summary = {
            "Name": name,
            "Size(MB)": size_mb,
            "Modified": datetime_string(info.get("LastModified", "-")),
            "ContentType": str(info.get("ContentType", "-")),
            "ServerSideEncryption": info.get("ServerSideEncryption", "-"),
            "Tags": str(info.get("tags", "-")),
            "_aws_url": aws_url(info, "S3", self.aws_account_clamp),  # Hidden Column
        }
        data_summary.append(summary)

    # Return the summary
    return pd.DataFrame(data_summary)

incoming_data_deep(refresh=False)

Get a deeper set of data for the Incoming Data in AWS

Parameters:

Name Type Description Default
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Name Type Description
dict dict

A summary of the Incoming Data in AWS

Source code in src/sageworks/api/meta.py
def incoming_data_deep(self, refresh: bool = False) -> dict:
    """Get a deeper set of data for the Incoming Data in AWS

    Args:
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        dict: A summary of the Incoming Data in AWS
    """
    # Delegate to the AWS broker; 'refresh' is forwarded as its force_refresh flag
    return self.aws_broker.get_metadata(ServiceCategory.INCOMING_DATA_S3, force_refresh=refresh)

model_details(model_group_name)

Get detailed information about a specific model group in AWS

Parameters:

Name Type Description Default
model_group_name str

The name of the model group

required

Returns:

Name Type Description
dict dict

Detailed information about the model group

Source code in src/sageworks/api/meta.py
def model_details(self, model_group_name: str) -> dict:
    """Look up the detailed metadata for a single model group in AWS

    Args:
        model_group_name (str): The name of the model group

    Returns:
        dict: The models_deep() entry for the model group (an empty dict if the group is not present)
    """
    # No 'refresh' is passed, so models_deep() runs with its default
    return self.models_deep().get(model_group_name, {})

models(refresh=False)

Get a summary of the Models in AWS

Parameters:

Name Type Description Default
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Type Description
DataFrame

pd.DataFrame: A summary of the Models in AWS

Source code in src/sageworks/api/meta.py
def models(self, refresh: bool = False) -> pd.DataFrame:
    """Summarize the Model Package Groups in AWS, one row per group

    Args:
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        pd.DataFrame: A summary of the Models in AWS
    """
    placeholder_columns = ["Created", "Ver", "Tags", "Input", "Status", "Description"]
    rows = []
    for group_name, versions in self.models_deep(refresh).items():

        if versions:
            # The first entry in the list is treated as the 'latest' model
            newest = versions[0]
            meta = newest.get("sageworks_meta", {})

            # Empty health tags mean the model is healthy
            health = meta.get("sageworks_health_tags", "-") or "healthy"
            rows.append(
                {
                    "Model Group": newest["ModelPackageGroupName"],
                    "Health": health,
                    "Owner": meta.get("sageworks_owner", "-"),
                    "Model Type": meta.get("sageworks_model_type"),
                    "Created": datetime_string(newest.get("CreationTime")),
                    "Ver": newest["ModelPackageVersion"],
                    "Tags": meta.get("sageworks_tags", "-"),
                    "Input": meta.get("sageworks_input", "-"),
                    "Status": newest["ModelPackageStatus"],
                    "Description": newest.get("ModelPackageDescription", "-"),
                }
            )
        else:
            # An empty Model Package Group is reported as a 'failed' placeholder row
            self.log.error(f"No models found in {group_name}")
            row = {"Model Group": group_name, "Health": "failed", "Owner": "empty", "Model Type": "empty"}
            row.update(dict.fromkeys(placeholder_columns, "-"))
            rows.append(row)

    return pd.DataFrame(rows)

models_deep(refresh=False)

Get a deeper set of data for Models in AWS

Parameters:

Name Type Description Default
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Name Type Description
dict dict

A summary of the Models in AWS

Source code in src/sageworks/api/meta.py
def models_deep(self, refresh: bool = False) -> dict:
    """Get a deeper set of data for Models in AWS

    Args:
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        dict: A summary of the Models in AWS
    """
    # Log forced refreshes before delegating to the AWS broker
    if refresh:
        self.log.monitor("Models Deep Dive Force Refresh...")
    return self.aws_broker.get_metadata(ServiceCategory.MODELS, force_refresh=refresh)

pipelines(refresh=False)

Get a summary of the SageWorks Pipelines

Parameters:

Name Type Description Default
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Type Description
DataFrame

pd.DataFrame: A summary of the SageWorks Pipelines

Source code in src/sageworks/api/meta.py
def pipelines(self, refresh: bool = False) -> pd.DataFrame:
    """Summarize the SageWorks Pipelines as a DataFrame

    Args:
        refresh (bool, optional): Not referenced by this method's body. Defaults to False.

    Returns:
        pd.DataFrame: A summary of the SageWorks Pipelines
    """
    # The pipeline manager's listing is wrapped directly in a DataFrame
    return pd.DataFrame(self.pipeline_manager.list_pipelines())

refresh_all_aws_meta()

Force a refresh of all the metadata

Source code in src/sageworks/api/meta.py
def refresh_all_aws_meta(self) -> None:
    """Force a refresh of all the metadata"""
    # The broker's return value is not used here; the call is made for the forced refresh
    self.aws_broker.get_all_metadata(force_refresh=True)

view_details(view_name, database='sageworks', refresh=False)

Get detailed information about a specific View in AWS

Parameters:

Name Type Description Default
view_name str

The name of the View

required
database str

Glue database. Defaults to 'sageworks'.

'sageworks'
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Name Type Description
dict Union[dict, None]

Detailed information about the view (or None if not found)

Source code in src/sageworks/api/meta.py
def view_details(self, view_name: str, database: str = "sageworks", refresh: bool = False) -> Union[dict, None]:
    """Look up the detailed metadata for a single View in AWS

    Args:
        view_name (str): The name of the View
        database (str, optional): Glue database. Defaults to 'sageworks'.
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        Union[dict, None]: Detailed information about the view (or None if not found)
    """
    # Both 'database' and 'refresh' are forwarded to views_deep() unchanged
    return self.views_deep(database=database, refresh=refresh).get(view_name)

views(database='sageworks')

Get a summary of all the Views, for the given database, in AWS

Parameters:

Name Type Description Default
database str

Glue database. Defaults to 'sageworks'.

'sageworks'

Returns:

Type Description
DataFrame

pd.DataFrame: A summary of all the Views, for the given database, in AWS

Source code in src/sageworks/api/meta.py
def views(self, database: str = "sageworks") -> pd.DataFrame:
    """Get a summary of all the Views, for the given database, in AWS

    Args:
        database (str, optional): Glue database. Defaults to 'sageworks'.

    Returns:
        pd.DataFrame: A summary of all the Views, for the given database, in AWS
    """
    # One row per view: its name, last update time, and column count
    rows = [
        {
            "Name": view_name,
            "Modified": datetime_string(view_info.get("UpdateTime")),
            "Num Columns": num_columns_ds(view_info),
        }
        for view_name, view_info in self.views_deep(database=database).items()
    ]
    return pd.DataFrame(rows)

views_deep(database='sageworks', refresh=False)

Get a deeper set of data for the Views in Athena/AWS

Parameters:

Name Type Description Default
database str

Glue database. Defaults to 'sageworks'.

'sageworks'
refresh bool

Force a refresh of the metadata. Defaults to False.

False

Returns:

Name Type Description
dict dict

Detailed information about all the Views in AWS

Source code in src/sageworks/api/meta.py
def views_deep(self, database: str = "sageworks", refresh: bool = False) -> dict:
    """Get a deeper set of data for the Views in Athena/AWS

    Args:
        database (str, optional): Glue database. Defaults to 'sageworks'.
        refresh (bool, optional): Force a refresh of the metadata. Defaults to False.

    Returns:
        dict: Detailed information about the Views in the given database
            (an empty dict if the metadata has no views, or none for that database)
    """
    if refresh:
        self.log.monitor("Views Deep Dive Force Refresh...")
    data = self.aws_broker.get_metadata(ServiceCategory.DATA_CATALOG, force_refresh=refresh)

    if "views" not in data:
        self.log.warning("No views found in the metadata")
        return {}

    # Views are in two databases, 'sageworks' and 'sagemaker_featurestore'.
    # A database with no entry yields an empty dict rather than a KeyError, so
    # callers such as view_details() can report "not found".
    views_by_database = data["views"]
    if database not in views_by_database:
        self.log.warning(f"No views found for database '{database}'")
        return {}

    # Return the data
    return views_by_database[database]

Refresh

Setting refresh to True will lead to substantial performance issues, so don't do it :).

Examples

These examples show how to use the Meta() class to pull lists of artifacts from AWS: DataSources, FeatureSets, Models, Endpoints, and more. If you're building a web interface plugin, the Meta class is a great place to start.

SageWorks REPL

If you'd like to see exactly what data/details you get back from the Meta() class, you can spin up the SageWorks REPL, use the class and test out all the methods. Try it out! SageWorks REPL

Using SageWorks REPL
[●●●]SageWorks:scp_sandbox> meta = Meta()
[●●●]SageWorks:scp_sandbox> model_info = meta.models()
[●●●]SageWorks:scp_sandbox> model_info
               Model Group   Health Owner  ...             Input     Status                Description
0      wine-classification  healthy     -  ...     wine_features  Completed  Wine Classification Model
1  abalone-regression-full  healthy     -  ...  abalone_features  Completed   Abalone Regression Model
2       abalone-regression  healthy     -  ...  abalone_features  Completed   Abalone Regression Model

[3 rows x 10 columns]

List the Models in AWS

meta_list_models.py
from sageworks.api.meta import Meta

# Instantiate Meta and pull the summary table of our Models
meta = Meta()
models = meta.models()

print(f"Number of Models: {len(models)}")
print(models)

# Get more detailed data on the Models (keyed by model group name)
model_groups = meta.models_deep()
for group_name, versions in model_groups.items():
    print(group_name)

Output

Number of Models: 3
               Model Group   Health Owner  ...             Input     Status                Description
0      wine-classification  healthy     -  ...     wine_features  Completed  Wine Classification Model
1  abalone-regression-full  healthy     -  ...  abalone_features  Completed   Abalone Regression Model
2       abalone-regression  healthy     -  ...  abalone_features  Completed   Abalone Regression Model

[3 rows x 10 columns]
wine-classification
abalone-regression-full
abalone-regression

Getting Model Performance Metrics

meta_models.py
from sageworks.api.meta import Meta

# Create our Meta Class to get metadata about our Models
meta = Meta()
model_info = meta.models_deep()

# Print out the summary of our Models
for name, info in model_info.items():
    print(name)

    # A model group can be empty; there is no latest model to summarize
    if not info:
        print("\tNo models found in this group")
        continue

    latest = info[0]  # We get a list of models, so we only want the latest
    sageworks_meta = latest.get("sageworks_meta", {})  # May be missing on some models
    print(f"\tARN: {latest['ModelPackageGroupArn']}")
    print(f"\tDescription: {latest.get('ModelPackageDescription', '-')}")
    print(f"\tTags: {sageworks_meta.get('sageworks_tags', '-')}")
    performance_metrics = sageworks_meta.get("sageworks_inference_metrics", "-")
    print("\tPerformance Metrics:")
    print(f"\t\t{performance_metrics}")

Output

wine-classification
    ARN: arn:aws:sagemaker:us-west-2:507740646243:model-package-group/wine-classification
    Description: Wine Classification Model
    Tags: wine::classification
    Performance Metrics:
        [{'wine_class': 'TypeA', 'precision': 1.0, 'recall': 1.0, 'fscore': 1.0, 'roc_auc': 1.0, 'support': 12}, {'wine_class': 'TypeB', 'precision': 1.0, 'recall': 1.0, 'fscore': 1.0, 'roc_auc': 1.0, 'support': 14}, {'wine_class': 'TypeC', 'precision': 1.0, 'recall': 1.0, 'fscore': 1.0, 'roc_auc': 1.0, 'support': 9}]

abalone-regression
    ARN: arn:aws:sagemaker:us-west-2:507740646243:model-package-group/abalone-regression
    Description: Abalone Regression Model
    Tags: abalone::regression
    Performance Metrics:
        [{'MAE': 1.64, 'RMSE': 2.246, 'R2': 0.502, 'MAPE': 16.393, 'MedAE': 1.209, 'NumRows': 834}]

List the Endpoints in AWS

meta_list_endpoints.py
from sageworks.api.meta import Meta

# Instantiate Meta and pull the summary table of our Endpoints
meta = Meta()
endpoint_summary = meta.endpoints()
print(f"Number of Endpoints: {len(endpoint_summary)}")
print(endpoint_summary)

# Walk the detailed Endpoint metadata, printing each name and the keys available for it
endpoint_details = meta.endpoints_deep()
for endpoint_name, endpoint_info in endpoint_details.items():
    print(endpoint_name)
    print(endpoint_info.keys())

Output

Number of Endpoints: 2
                      Name   Health            Instance           Created  ...     Status     Variant Capture Samp(%)
0  wine-classification-end  healthy  Serverless (2GB/5)  2024-03-23 23:09  ...  InService  AllTraffic   False       -
1   abalone-regression-end  healthy  Serverless (2GB/5)  2024-03-23 21:11  ...  InService  AllTraffic   False       -

[2 rows x 10 columns]
wine-classification-end
dict_keys(['EndpointName', 'EndpointArn', 'EndpointConfigName', 'ProductionVariants', 'EndpointStatus', 'CreationTime', 'LastModifiedTime', 'ResponseMetadata', 'InstanceType', 'sageworks_meta'])
abalone-regression-end
dict_keys(['EndpointName', 'EndpointArn', 'EndpointConfigName', 'ProductionVariants', 'EndpointStatus', 'CreationTime', 'LastModifiedTime', 'ResponseMetadata', 'InstanceType', 'sageworks_meta'])

Not Finding some particular AWS Data?

The SageWorks Meta API Class also has _details() methods, so make sure to check those out.