Gets the size of a vocabulary created using `tft.vocabulary`.
tft.experimental.get_vocabulary_size_by_name(
vocab_filename: str
) -> tf.Tensor
Used in the notebooks
This is the number of keys in the output `vocab_filename` and does not include
the number of OOV buckets.
Args:
  vocab_filename: The name of the vocabulary file whose size is to be
    retrieved.
Example:
def preprocessing_fn(inputs):
num_oov_buckets = 1
x_int = tft.compute_and_apply_vocabulary(
inputs['x'], vocab_filename='my_vocab',
num_oov_buckets=num_oov_buckets)
depth = (
tft.experimental.get_vocabulary_size_by_name('my_vocab') +
num_oov_buckets)
x_encoded = tf.one_hot(
x_int, depth=tf.cast(depth, tf.int32), dtype=tf.int64)
return {'x_encoded': x_encoded}
raw_data = [dict(x='foo'), dict(x='foo'), dict(x='bar')]
feature_spec = dict(x=tf.io.FixedLenFeature([], tf.string))
raw_data_metadata = tft.DatasetMetadata.from_feature_spec(feature_spec)
with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
transformed_dataset, transform_fn = (
(raw_data, raw_data_metadata)
| tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
transformed_data, transformed_metadata = transformed_dataset
transformed_data
[{'x_encoded': array([1, 0, 0])}, {'x_encoded': array([1, 0, 0])},
{'x_encoded': array([0, 1, 0])}]
Returns:
  An integer tensor containing the size of the requested vocabulary.
Raises:
  ValueError: If no vocabulary size is found for the given `vocab_filename`.