Source code for tamr_unify_client.mastering.project

import json

from tamr_unify_client.base_model import MachineLearningModel
from tamr_unify_client.dataset.resource import Dataset
from tamr_unify_client.mastering.binning_model import BinningModel
from tamr_unify_client.mastering.estimated_pair_counts import EstimatedPairCounts
from tamr_unify_client.mastering.published_cluster.configuration import (
    PublishedClustersConfiguration,
)
from tamr_unify_client.mastering.published_cluster.record import RecordPublishedCluster
from tamr_unify_client.mastering.published_cluster.resource import PublishedCluster
from tamr_unify_client.project.resource import Project


class MasteringProject(Project):
    """A Mastering project in Tamr."""

    def pairs(self):
        """Record pairs generated by Tamr's binning model.

        Pairs are displayed on the "Pairs" page in the Tamr UI.

        Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from
        this dataset to regenerate pairs according to the latest binning model.

        :returns: The record pairs represented as a dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        alias = self.api_path + "/recordPairs"
        return Dataset(self.client, None, alias)

    def pair_matching_model(self):
        """Machine learning model for pair-matching for this Mastering project.

        Learns from verified labels and predicts categorization labels for
        unlabeled pairs.

        Calling :func:`~tamr_unify_client.base_model.MachineLearningModel.predict`
        from this model will produce new (unpublished) clusters. These clusters
        are displayed on the "Clusters" page in the Tamr UI.

        :returns: The machine learning model for pair-matching.
        :rtype: :class:`~tamr_unify_client.base_model.MachineLearningModel`
        """
        alias = self.api_path + "/recordPairsWithPredictions/model"
        return MachineLearningModel(self.client, None, alias)

    def high_impact_pairs(self):
        """High-impact pairs as a dataset.

        Tamr labels pairs as "high-impact" if labeling these pairs would help
        it learn most quickly (i.e. "Active learning").

        High-impact pairs are displayed with a ⚡ lightning bolt icon on the
        "Pairs" page in the Tamr UI.

        Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from
        this dataset to produce new high-impact pairs according to the latest
        pair-matching model.

        :returns: The high-impact pairs represented as a dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        alias = self.api_path + "/highImpactPairs"
        return Dataset(self.client, None, alias)

    def record_clusters(self):
        """Record Clusters as a dataset.

        Tamr clusters labeled pairs using pairs model. These clusters populate
        the cluster review page and get transient cluster ids, rather than
        published cluster ids (i.e., "Permanent Ids").

        Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from
        this dataset to generate clusters based on the latest pair-matching
        model.

        :returns: The record clusters represented as a dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        alias = self.api_path + "/recordClusters"
        return Dataset(self.client, None, alias)

    def _dedup_dataset(self, suffix):
        """Fetch a dataset named after this project's unified dataset.

        Replace this workaround with a direct API call once API is fixed.
        APIs that need to work are: fetching the dataset and being able to
        call refresh on resulting dataset. Until then, we grab the dataset by
        constructing its name from the corresponding Unified Dataset's name.

        :param suffix: Text appended to the unified dataset's name to form the
            name of the dataset to look up.
        :type suffix: str
        :returns: The result of looking the constructed name up via
            ``self.client.datasets.by_name``.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        name = self.unified_dataset().name + suffix
        return self.client.datasets.by_name(name)

    def published_clusters(self):
        """Published record clusters generated by Tamr's pair-matching model.

        :returns: The published clusters represented as a dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        canonical = self._dedup_dataset("_dedup_published_clusters")
        alias = self.api_path + "/publishedClusters"
        # Rebuild the dataset from the canonical JSON so that it is addressed
        # through this project's endpoint rather than its own path.
        return Dataset.from_json(self.client, canonical._data, alias)

    def published_clusters_configuration(self):
        """Retrieves published clusters configuration for this project.

        :returns: The published clusters configuration
        :rtype: :class:`~tamr_unify_client.mastering.published_cluster.configuration.PublishedClustersConfiguration`
        """
        alias = self.api_path + "/publishedClustersConfiguration"
        resource_json = self.client.get(alias).successful().json()
        return PublishedClustersConfiguration.from_json(
            self.client, resource_json, alias
        )

    def published_cluster_ids(self):
        """Retrieves published cluster IDs for this project.

        :returns: The published cluster ID dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        dataset = self._dedup_dataset("_dedup_all_persistent_ids")
        path = self.api_path + "/allPublishedClusterIds"
        return Dataset.from_json(self.client, dataset._data, path)

    def published_cluster_stats(self):
        """Retrieves published cluster stats for this project.

        :returns: The published cluster stats dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        dataset = self._dedup_dataset("_dedup_published_cluster_stats")
        path = self.api_path + "/publishedClusterStats"
        return Dataset.from_json(self.client, dataset._data, path)

    def published_cluster_versions(self, cluster_ids):
        """Retrieves version information for the specified published clusters.

        See https://docs.tamr.com/reference#retrieve-published-clusters-given-cluster-ids.

        :param cluster_ids: The persistent IDs of the clusters to get version
            information for.
        :type cluster_ids: iterable[str]
        :return: A stream of the published clusters.
        :rtype: Python generator yielding
            :class:`~tamr_unify_client.mastering.published_cluster.resource.PublishedCluster`
        """
        path = self.api_path + "/publishedClusterVersions"
        return self._cluster_versions(PublishedCluster, cluster_ids, path)

    def record_published_cluster_versions(self, record_ids):
        """Retrieves version information for the published clusters of the given records.

        See https://docs.tamr.com/reference#retrieve-published-clusters-given-record-ids.

        :param record_ids: The Tamr IDs of the records to get cluster version
            information for.
        :type record_ids: iterable[str]
        :return: A stream of the relevant published clusters.
        :rtype: Python generator yielding
            :class:`~tamr_unify_client.mastering.published_cluster.record.RecordPublishedCluster`
        """
        path = self.api_path + "/recordPublishedClusterVersions"
        return self._cluster_versions(RecordPublishedCluster, record_ids, path)

    def _cluster_versions(self, cluster_class, ids, endpoint):
        """Retrieves version information for published clusters.

        The IDs are JSON-encoded one per line and POSTed to ``endpoint``; each
        non-empty line of the streamed response is parsed as JSON and passed
        to ``cluster_class``.

        This is a generator function, so no request is made until the result
        is first iterated.

        :param cluster_class: The class to create instances of.
        :param ids: The IDs of the clusters or records to get version
            information for.
        :type ids: iterable[str]
        :param endpoint: The endpoint to call for versions.
        :type endpoint: str
        :return: A stream of the published clusters.
        """
        string_ids = "\n".join(json.dumps(i) for i in ids)
        with self.client.post(endpoint, data=string_ids, stream=True) as response:
            for line in response.iter_lines():
                # A streamed response may contain empty keep-alive lines;
                # json.loads would raise on them, so skip them.
                if not line:
                    continue
                yield cluster_class(json.loads(line))

    def estimate_pairs(self):
        """Returns pair estimate information for a mastering project.

        :return: Pairs Estimate information.
        :rtype: :class:`~tamr_unify_client.mastering.estimated_pair_counts.EstimatedPairCounts`
        """
        alias = self.api_path + "/estimatedPairCounts"
        estimate_json = self.client.get(alias).successful().json()
        return EstimatedPairCounts.from_json(
            self.client, estimate_json, api_path=alias
        )

    def record_clusters_with_data(self):
        """Project's unified dataset with associated clusters.

        :returns: The record clusters with data represented as a dataset
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        dataset = self._dedup_dataset("_dedup_clusters_with_data")
        # Point the fetched dataset at this project's endpoint.
        dataset.api_path = self.api_path + "/recordClustersWithData"
        return dataset

    def published_clusters_with_data(self):
        """Project's unified dataset with associated published clusters.

        :returns: The published clusters with data represented as a dataset
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        dataset = self._dedup_dataset("_dedup_published_clusters_with_data")
        # Point the fetched dataset at this project's endpoint.
        dataset.api_path = self.api_path + "/publishedClustersWithData"
        return dataset

    def binning_model(self):
        """Binning model for this project.

        :return: Binning model for this project.
        :rtype: :class:`~tamr_unify_client.mastering.binning_model.BinningModel`
        """
        alias = self.api_path + "/binningModel"
        # Cannot get this resource and so we hard code
        resource_json = {"relativeId": alias}
        return BinningModel.from_json(self.client, resource_json, alias)

    # super.__repr__ is sufficient