ממשק API של NumPy ב- TensorFlow

הצג באתר TensorFlow.org

הפעל בגוגל קולאב

צפה במקור ב-GitHub

הורד מחברת

סקירה כללית

TensorFlow מיישמת תת-קבוצה של NumPy API , זמין כ- tf.experimental.numpy . זה מאפשר להריץ קוד NumPy, המואץ על ידי TensorFlow, ובמקביל מאפשר גישה לכל ממשקי ה-API של TensorFlow.

הגדרות

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow.experimental.numpy as tnp
import timeit

print("Using TensorFlow version %s" % tf.__version__)

Using TensorFlow version 2.6.0

הפעלת התנהגות NumPy

כדי להשתמש ב- tnp בתור NumPy, הפעל את התנהגות NumPy עבור TensorFlow:

tnp.experimental_enable_numpy_behavior()

קריאה זו מפעילה קידום סוגים (type promotion) ב-TensorFlow, וגם משנה את הסקת הסוג בעת המרת ליטרלים לטנזורים, כך שתתאים באופן מדויק יותר לתקן NumPy.

מערך TensorFlow NumPy ND

מופע של tf.experimental.numpy.ndarray , הנקרא ND Array , מייצג מערך צפוף רב-ממדי של dtype נתון המוצב על התקן מסוים. זה כינוי ל- tf.Tensor . בדוק את מחלקת מערך ND עבור שיטות שימושיות כמו ndarray.T , ndarray.reshape , ndarray.ravel ואחרות.

תחילה צור אובייקט מערך ND, ולאחר מכן הפעל שיטות שונות.

# Build a 5x3 float32 ND array and report its basic attributes.
ones = tnp.ones([5, 3], dtype=tnp.float32)
attributes = (ones.shape, ones.ndim, ones.dtype, ones.device)
print("Created ND array with shape = %s, rank = %s, "
      "dtype = %s on device = %s\n" % attributes)

# An ND array is a `tf.Tensor`, so the isinstance check is expected to pass.
is_tensor = isinstance(ones, tf.Tensor)
print("Is `ones` an instance of tf.Tensor: %s\n" % is_tensor)

# Exercise two commonly used members: transpose and flattening reshape.
transposed = ones.T
flattened = ones.reshape(-1)
print("ndarray.T has shape %s" % str(transposed.shape))
print("narray.reshape(-1) has shape %s" % flattened.shape)

Created ND array with shape = (5, 3), rank = 2, dtype = <dtype: 'float32'> on device = /job:localhost/replica:0/task:0/device:GPU:0

Is `ones` an instance of tf.Tensor: True

ndarray.T has shape (3, 5)
narray.reshape(-1) has shape (15,)

קידום סוגים (Type promotion)

לממשקי API של TensorFlow NumPy יש סמנטיקה מוגדרת היטב להמרת ליטרלים למערך ND, כמו גם לביצוע קידום סוגים בכניסות למערך ND. אנא ראה np.result_type לפרטים נוספים.

ממשקי API של TensorFlow משאירים את כניסות tf.Tensor ללא שינוי ואינם מבצעים עליהם קידום סוג, בעוד שממשקי API של TensorFlow NumPy מקדמים את כל התשומות לפי כללי קידום מסוג NumPy. בדוגמה הבאה תבצע קידום סוג. ראשית, הפעל הוספה על כניסות מערך ND מסוגים שונים ושימו לב לסוגי הפלט. אף אחד מסוגי הקידום האלה לא יורשה על ידי ממשקי API של TensorFlow.

print("Type promotion for operations")
# One scalar ND array per dtype, ordered from narrowest to widest.
dtypes = (tnp.int32, tnp.int64, tnp.float32, tnp.float64)
values = [tnp.asarray(1, dtype=dtype) for dtype in dtypes]
# Add every unordered pair of distinct dtypes and show the promoted result.
for index, lhs in enumerate(values):
  for rhs in values[index + 1:]:
    promoted = lhs + rhs
    print("%s + %s => %s" %
          (lhs.dtype.name, rhs.dtype.name, promoted.dtype.name))

Type promotion for operations
int32 + int64 => int64
int32 + float32 => float64
int32 + float64 => float64
int64 + float32 => float64
int64 + float64 => float64
float32 + float64 => float64

לבסוף, המר ליטרלים למערך ND באמצעות ndarray.asarray ושים לב לסוג המתקבל.

# Convert an int literal and a float literal and show the inferred dtypes.
print("Type inference during array creation")
int_dtype = tnp.asarray(1).dtype
float_dtype = tnp.asarray(1.).dtype
print("tnp.asarray(1).dtype == tnp.%s" % int_dtype.name)
print("tnp.asarray(1.).dtype == tnp.%s\n" % float_dtype.name)

Type inference during array creation
tnp.asarray(1).dtype == tnp.int64
tnp.asarray(1.).dtype == tnp.float64

בעת המרת ליטרלים למערך ND, NumPy מעדיף סוגים רחבים כמו tnp.int64 ו- tnp.float64 . לעומת זאת, tf.convert_to_tensor מעדיף את סוגי tf.int32 ו- tf.float32 להמרת קבועים ל- tf.Tensor . ממשקי API של TensorFlow NumPy דבקים בהתנהגות NumPy עבור מספרים שלמים. לגבי floats, הארגומנט prefer_float32 של experimental_enable_numpy_behavior מאפשר לך לקבוע אם להעדיף את tf.float32 על פני tf.float64 (ברירת המחדל היא False ). לדוגמה:

# Run the same float-literal checks under both settings of `prefer_float32`.
# The loop ends with `prefer_float32=False`, restoring the default behavior.
for prefer_float32, header in ((True, "When prefer_float32 is True:"),
                               (False, "When prefer_float32 is False:")):
  tnp.experimental_enable_numpy_behavior(prefer_float32=prefer_float32)
  print(header)
  converted_dtype = tnp.asarray(1.).dtype
  print("tnp.asarray(1.).dtype == tnp.%s" % converted_dtype.name)
  added_dtype = tnp.add(1., 2.).dtype
  print("tnp.add(1., 2.).dtype == tnp.%s" % added_dtype.name)

When prefer_float32 is True:
tnp.asarray(1.).dtype == tnp.float32
tnp.add(1., 2.).dtype == tnp.float32
When prefer_float32 is False:
tnp.asarray(1.).dtype == tnp.float64
tnp.add(1., 2.).dtype == tnp.float64

שידור

בדומה ל-TensorFlow, NumPy מגדירה סמנטיקה עשירה עבור ערכי "שידור". אתה יכול לעיין במדריך השידורים של NumPy למידע נוסף ולהשוות זאת עם סמנטיקה של שידור TensorFlow .

# Three operands of different rank that broadcast to a common shape.
x, y, z = tnp.ones([2, 3]), tnp.ones([3]), tnp.ones([1, 2, 1])
broadcast_sum = x + y + z
print("Broadcasting shapes %s, %s and %s gives shape %s" % (
    x.shape, y.shape, z.shape, broadcast_sum.shape))

Broadcasting shapes (2, 3), (3,) and (1, 2, 1) gives shape (1, 2, 3)

יצירת אינדקס

NumPy מגדירה כללי אינדוקס מתוחכמים מאוד. עיין במדריך NumPy לאינדוקס . שים לב לשימוש במערכי ND כאינדקסים בדוגמה שלהלן.

# A 2x3x4 array holding 0..23, shared by the three indexing examples below.
x = tnp.arange(24).reshape(2, 3, 4)

# Integer, `newaxis`, slice and ellipsis combined in a single subscript.
basic_result = x[1, tnp.newaxis, 1:3, ...]
print("Basic indexing")
print(basic_result, "\n")

# A tuple of booleans used as a mask in the second subscript position.
row_mask = (True, False, True)
print("Boolean indexing")
print(x[:, row_mask], "\n")

# A Python tuple and an ND array used together as index arrays.
row_indices = (0, 0, 1)
column_indices = tnp.asarray([0, 1, 1])
print("Advanced indexing")
print(x[1, row_indices, column_indices])

Basic indexing
tf.Tensor(
[[[16 17 18 19]
  [20 21 22 23]]], shape=(1, 2, 4), dtype=int64) 

Boolean indexing
tf.Tensor(
[[[ 0  1  2  3]
  [ 8  9 10 11]]

 [[12 13 14 15]
  [20 21 22 23]]], shape=(2, 2, 4), dtype=int64) 

Advanced indexing
tf.Tensor([12 13 17], shape=(3,), dtype=int64)

# Item assignment on an ND array is expected to raise `TypeError`.
immutable = tnp.arange(6)
try:
  immutable[1] = -1
except TypeError:
  print("Currently, TensorFlow NumPy does not support mutation.")

Currently, TensorFlow NumPy does not support mutation.

מודל לדוגמה

לאחר מכן, תוכל לראות כיצד ליצור מודל ולהריץ עליו הסקה (inference). מודל פשוט זה מחיל שכבת relu ואחריה הטלה ליניארית. חלקים מאוחרים יותר יראו כיצד לחשב גרדיאנטים עבור מודל זה באמצעות GradientTape של TensorFlow.

class Model:
  """Small network: a dense layer with ReLU followed by a linear projection.

  The weights are created lazily on the first `predict` call, using
  `inputs.shape[1]` as the input feature size, and are reused afterwards.
  """

  def __init__(self):
    # `None` until the first `predict` call, then the tuple `(w1, bias, w2)`.
    self.weights = None

  def _create_weights(self, size):
    """Draws random float32 weights for an input with `size` features."""
    # Note that type `tnp.float32` is used for performance.
    stddev = tnp.sqrt(size).astype(tnp.float32)
    # `w1` is scaled down by sqrt(size); `w2` by the constant 8.
    w1 = tnp.random.randn(size, 64).astype(tnp.float32) / stddev
    bias = tnp.random.randn(64).astype(tnp.float32)
    w2 = tnp.random.randn(64, 2).astype(tnp.float32) / 8
    return (w1, bias, w2)

  def predict(self, inputs):
    """Runs the forward pass on `inputs` and returns the projected output.

    Args:
      inputs: 2-D input whose second dimension is the feature size.

    Returns:
      The result of `relu(inputs @ w1 + bias) @ w2`.
    """
    if self.weights is None:
      self.weights = self._create_weights(inputs.shape[1])
    hidden_kernel, hidden_bias, projection = self.weights
    pre_activation = tnp.matmul(inputs, hidden_kernel) + hidden_bias
    hidden = tnp.maximum(pre_activation, 0)  # Relu
    return tnp.matmul(hidden, projection)  # Linear projection

model = Model()
# Feed a batch of two all-ones examples with 32 features and show the output.
sample_inputs = tnp.ones([2, 32], dtype=tnp.float32)
print(model.predict(sample_inputs))

tf.Tensor(
[[-1.7706785  1.1137733]
 [-1.7706785  1.1137733]], shape=(2, 2), dtype=float32)

TensorFlow NumPy ו-NumPy

TensorFlow NumPy מיישמת תת-קבוצה של מפרט NumPy המלא. בעוד סמלים נוספים יתווספו עם הזמן, ישנן תכונות שיטתיות שלא ייתמכו בעתיד הקרוב. אלה כוללות תמיכה ב-NumPy C API, אינטגרציה עם Swig, סדר אחסון Fortran, תצוגות (views) ו- stride_tricks , וכמה סוגי dtype (כמו np.recarray ו- np.object ). לפרטים נוספים, עיין בתיעוד של TensorFlow NumPy API .

יכולת פעולה הדדית של NumPy

מערכי TensorFlow ND יכולים לפעול יחד עם פונקציות NumPy. אובייקטים אלו מיישמים את ממשק __array__ . NumPy משתמשת בממשק זה כדי להמיר ארגומנטים של פונקציה לערכי np.ndarray לפני עיבודם.

באופן דומה, פונקציות TensorFlow NumPy יכולות לקבל קלט מסוגים שונים כולל np.ndarray . כניסות אלו מומרות למערך ND על ידי קריאת ndarray.asarray עליהם.

המרה של מערך ND אל np.ndarray עשויה לגרום להעתקת נתונים בפועל. אנא עיין בסעיף על עותקי מאגר לפרטים נוספים.

report = "sum = %s. Class: %s"

# A NumPy function applied to an ND array.
nd_ones = tnp.ones([2, 3])
np_sum = np.sum(nd_ones)
print(report % (float(np_sum), np_sum.__class__))

# A TensorFlow NumPy function applied to an `np.ndarray`.
np_ones = np.ones([2, 3])
tnp_sum = tnp.sum(np_ones)
print(report % (float(tnp_sum), tnp_sum.__class__))

sum = 6.0. Class: <class 'numpy.float64'>
sum = 6.0. Class: <class 'tensorflow.python.framework.ops.EagerTensor'>

# It is easy to plot ND arrays, given the __array__ interface.
# `labels` is a (1, 1000) ND array of random values scaled by 2 and shifted by 15.
labels = 15 + 2 * tnp.random.randn(1, 1000)
# The return value of `plt.hist` is assigned to `_` and not used afterwards.
_ = plt.hist(labels)

png

עותקי מאגר

ערבוב TensorFlow NumPy עם קוד NumPy עשוי להפעיל עותקי נתונים. הסיבה לכך היא של-TensorFlow NumPy דרישות מחמירות יותר לגבי יישור זיכרון מאלה של NumPy.

כאשר np.ndarray מועבר ל- TensorFlow NumPy, הוא יבדוק דרישות יישור ויפעיל עותק במידת הצורך. בעת העברת מאגר CPU של מערך ND ל-NumPy, בדרך כלל המאגר יעמוד בדרישות היישור ו-NumPy לא תצטרך ליצור עותק.

מערכי ND יכולים להתייחס למאגרים המוצבים בהתקנים שאינם זיכרון המעבד המקומי. במקרים כאלה, הפעלת פונקציית NumPy תפעיל עותקים ברחבי הרשת או המכשיר לפי הצורך.

בהתחשב בכך, שילוב עם קריאות NumPy API צריך להיעשות בדרך כלל בזהירות והמשתמש צריך להיזהר מתקורות של העתקת נתונים. השזירה של שיחות TensorFlow NumPy עם קריאות TensorFlow היא בדרך כלל בטוחה ונמנעת מהעתקת נתונים. עיין בסעיף על יכולת פעולה הדדית של TensorFlow לפרטים נוספים.

קדימות אופרטורים

TensorFlow NumPy מגדיר __array_priority__ גבוהה מזו של NumPy. משמעות הדבר היא שעבור אופרטורים הכוללים גם מערך ND וגם np.ndarray , הראשון יקבל עדיפות, כלומר, קלט np.ndarray יומר למערך ND והיישום TensorFlow NumPy של האופרטור יופעל.

# Mix an ND array and an `np.ndarray` in one binary operation and inspect
# the class of the result.
nd_operand = tnp.ones([2])
np_operand = np.ones([2])
x = nd_operand + np_operand
print("x = %s\nclass = %s" % (x, x.__class__))

x = tf.Tensor([2. 2.], shape=(2,), dtype=float64)
class = <class 'tensorflow.python.framework.ops.EagerTensor'>

TF NumPy ו-TensorFlow

TensorFlow NumPy בנוי על גבי TensorFlow ומכאן פועל בצורה חלקה עם TensorFlow.

`tf.Tensor` ומערך ND

מערך ND הוא כינוי ל- tf.Tensor , אז ברור שניתן לערבב אותם מבלי להפעיל עותקי נתונים בפועל.

x = tf.constant([1, 2])
print(x)

# Converting in either direction is a no-op here.
tnp_x = tnp.asarray(x)
print(tnp_x)
round_tripped = tf.convert_to_tensor(tnp_x)
print(round_tripped)

# `tf.Tensor.numpy()` still returns an `np.ndarray`.
first_copy, second_copy = x.numpy(), x.numpy()
print(first_copy, second_copy.__class__)

tf.Tensor([1 2], shape=(2,), dtype=int32)
tf.Tensor([1 2], shape=(2,), dtype=int32)
tf.Tensor([1 2], shape=(2,), dtype=int32)
[1 2] <class 'numpy.ndarray'>

יכולת פעולה הדדית של TensorFlow

ניתן להעביר מערך ND לממשקי API של TensorFlow, מכיוון שמערך ND הוא רק כינוי ל- tf.Tensor . כפי שהוזכר קודם לכן, פעולה הדדית כזו אינה עושה העתקות נתונים, אפילו עבור נתונים המוצבים על מאיצים או מכשירים מרוחקים.

לעומת זאת, ניתן להעביר אובייקטי tf.Tensor לממשקי API של tf.experimental.numpy , מבלי לבצע העתקות נתונים.

report = "Output = %s"

# A TensorFlow function applied to an ND array.
nd_ones = tnp.ones([2, 3], tnp.float32)
tf_sum = tf.reduce_sum(nd_ones)
print(report % tf_sum)

# A TensorFlow NumPy function applied to a `tf.Tensor`.
tf_ones = tf.ones([2, 3])
tnp_sum = tnp.sum(tf_ones)
print(report % tnp_sum)

Output = tf.Tensor(6.0, shape=(), dtype=float32)
Output = tf.Tensor(6.0, shape=(), dtype=float32)

שיפועים ויעקוביאנים: tf.GradientTape

ניתן להשתמש ב- GradientTape של TensorFlow להפצה לאחור באמצעות קוד TensorFlow ו- TensorFlow NumPy.

השתמש במודל שנוצר בסעיף הדוגמה שלמעלה, וחשב גרדיאנטים ויעקוביאנים.

def create_batch(batch_size=32):
  """Returns a random `(inputs, labels)` pair of float32 ND arrays.

  Args:
    batch_size: Number of examples in the batch.

  Returns:
    A tuple of inputs with shape `(batch_size, 32)` and labels with shape
    `(batch_size, 2)`.
  """
  inputs = tnp.random.randn(batch_size, 32).astype(tnp.float32)
  labels = tnp.random.randn(batch_size, 2).astype(tnp.float32)
  return (inputs, labels)

def compute_gradients(model, inputs, labels):
  """Returns gradients of the summed squared error w.r.t. `model.weights`.

  Args:
    model: Object exposing `weights` (must not be `None`) and `predict`.
    inputs: Batch passed to `model.predict`.
    labels: Targets subtracted from the prediction.

  Returns:
    The result of `tape.gradient(loss, model.weights)`.
  """
  assert model.weights is not None
  with tf.GradientTape() as tape:
    # The weights are not tf.Variables, so the tape only tracks them if
    # they are watched explicitly.
    tape.watch(model.weights)
    residual = model.predict(inputs) - labels
    loss = tnp.sum(tnp.square(residual))
  # Differentiate the loss recorded above with respect to the weights.
  return tape.gradient(loss, model.weights)

inputs, labels = create_batch()
gradients = compute_gradients(model, inputs, labels)

# The gradient shapes are expected to match the parameter shapes one-to-one.
parameter_shapes = [w.shape for w in model.weights]
gradient_shapes = [g.shape for g in gradients]
print("Parameter shapes:", parameter_shapes)
print("Gradient shapes:", gradient_shapes)
# Gradients come back as ND arrays as well.
first_gradient = gradients[0]
assert isinstance(first_gradient, tnp.ndarray)

Parameter shapes: [TensorShape([32, 64]), TensorShape([64]), TensorShape([64, 2])]
Gradient shapes: [TensorShape([32, 64]), TensorShape([64]), TensorShape([64, 2])]

def prediction_batch_jacobian(inputs):
  """Returns the model prediction and its batch jacobian w.r.t. `inputs`.

  Each row of the jacobian is the jacobian of one element of the output
  batch with respect to the corresponding element of the input batch.
  Uses the module-level `model`.
  """
  with tf.GradientTape() as tape:
    # `inputs` must be watched explicitly so the tape records operations on it.
    tape.watch(inputs)
    outputs = model.predict(inputs)
  jacobian = tape.batch_jacobian(outputs, inputs)
  return outputs, jacobian

inp_batch = tnp.ones([16, 32], tnp.float32)
output, batch_jacobian = prediction_batch_jacobian(inp_batch)
# Compare the jacobian shape with the input and output shapes.
shapes = (output.shape, inp_batch.shape)
print("Output shape: %s, input shape: %s" % shapes)
print("Batch jacobian shape:", batch_jacobian.shape)

Output shape: (16, 2), input shape: (16, 32)
Batch jacobian shape: (16, 2, 32)

הידור עקבות: tf.function

tf.function של TensorFlow פועלת על ידי "הידור עקבות" (trace compiling) של הקוד, ולאחר מכן אופטימיזציה של העקבות הללו לביצועים מהירים בהרבה. עיין במבוא לגרפים ופונקציות .

ניתן להשתמש ב- tf.function כדי לייעל את קוד TensorFlow NumPy גם כן. הנה דוגמה פשוטה להדגמת המהירות. שים לב שגוף הקוד tf.function כולל קריאות לממשקי API של TensorFlow NumPy.

inputs, labels = create_batch(512)

# `timeit.timeit` returns the total seconds for `number` runs; with
# number=10, multiplying by 100 gives milliseconds per call.
print("Eager performance")
compute_gradients(model, inputs, labels)
eager_seconds = timeit.timeit(
    lambda: compute_gradients(model, inputs, labels), number=10)
print(eager_seconds * 100, "ms")

print("\ntf.function compiled performance")
compiled_compute_gradients = tf.function(compute_gradients)
compiled_compute_gradients(model, inputs, labels)  # warmup
compiled_seconds = timeit.timeit(
    lambda: compiled_compute_gradients(model, inputs, labels), number=10)
print(compiled_seconds * 100, "ms")

Eager performance
1.291419400013183 ms

tf.function compiled performance
0.5561202000080812 ms

וקטוריזציה: tf.vectorized_map

ל-TensorFlow יש תמיכה מובנית בווקטוריזציה של לולאות מקבילות, המאפשרת האצה של סדר גודל אחד עד שניים. האצות אלה נגישות דרך ממשק ה-API של tf.vectorized_map והן חלות גם על קוד TensorFlow NumPy.

לפעמים שימושי לחשב את השיפוע של כל פלט באצווה ביחס לרכיב הקלט המתאים באצווה. חישוב כזה יכול להתבצע ביעילות באמצעות tf.vectorized_map כפי שמוצג להלן.

@tf.function
def vectorized_per_example_gradients(inputs, labels):
  """Computes per-example gradients with `tf.vectorized_map`.

  Semantically this maps a single-example gradient computation over each row
  of `inputs` and `labels`, with an interface similar to `tf.map_fn`. The
  underlying machinery vectorizes the map loop away, which gives nice
  speedups. Uses the module-level `model`.
  """
  def gradients_for_example(example):
    features, target = example
    # Re-add a leading batch dimension of size 1 to each row.
    batched_features = tnp.expand_dims(features, 0)
    batched_target = tnp.expand_dims(target, 0)
    return compute_gradients(model, batched_features, batched_target)

  return tf.vectorized_map(gradients_for_example, (inputs, labels))

batch_size = 128
inputs, labels = create_batch(batch_size)

per_example_gradients = vectorized_per_example_gradients(inputs, labels)
# Show each weight's shape next to the shape of its per-example gradients.
shape_report = (
    "Weight shape: %s, batch size: %s, per example gradient shape: %s ")
for weight, example_gradients in zip(model.weights, per_example_gradients):
  print(shape_report % (weight.shape, batch_size, example_gradients.shape))

Weight shape: (32, 64), batch size: 128, per example gradient shape: (128, 32, 64) 
Weight shape: (64,), batch size: 128, per example gradient shape: (128, 64) 
Weight shape: (64, 2), batch size: 128, per example gradient shape: (128, 64, 2)

# Sequential baseline for benchmarking against the vectorized computation:
# the same per-example gradients, computed with `tf.map_fn`.
@tf.function
def unvectorized_per_example_gradients(inputs, labels):
  """Computes per-example gradients one row at a time with `tf.map_fn`.

  Uses the module-level `model`. The output signature lists one float32
  entry for each of the three gradients returned per example.
  """
  def gradients_for_example(example):
    features, target = example
    # Re-add a leading batch dimension of size 1 to each row.
    batched_features = tnp.expand_dims(features, 0)
    batched_target = tnp.expand_dims(target, 0)
    return compute_gradients(model, batched_features, batched_target)

  output_signature = (tf.float32, tf.float32, tf.float32)
  return tf.map_fn(gradients_for_example, (inputs, labels),
                   fn_output_signature=output_signature)

# `timeit.timeit` returns the total seconds for `number` runs; with
# number=10, multiplying by 100 gives milliseconds per call.
print("Running vectorized computation")
vectorized_seconds = timeit.timeit(
    lambda: vectorized_per_example_gradients(inputs, labels), number=10)
print(vectorized_seconds * 100, "ms")

print("\nRunning unvectorized computation")
# This first call runs once before the timed loop.
per_example_gradients = unvectorized_per_example_gradients(inputs, labels)
unvectorized_seconds = timeit.timeit(
    lambda: unvectorized_per_example_gradients(inputs, labels), number=10)
print(unvectorized_seconds * 100, "ms")

Running vectorized computation
0.5265710999992734 ms

Running unvectorized computation
40.35122630002661 ms

מיקום המכשיר

TensorFlow NumPy יכול לבצע פעולות על CPUs, GPUs, TPUs והתקנים מרוחקים. הוא משתמש במנגנוני TensorFlow סטנדרטיים למיקום המכשיר. להלן דוגמה פשוטה מראה כיצד לרשום את כל המכשירים ולאחר מכן לבצע חישוב כלשהו במכשיר מסוים.

ל-TensorFlow יש גם ממשקי API לשכפול חישובים בין מכשירים ולביצוע הפחתות קולקטיביות, שלא יכוסו כאן.

רשימת מכשירים

ניתן להשתמש ב- tf.config.list_logical_devices ו- tf.config.list_physical_devices כדי למצוא באילו מכשירים להשתמש.

print("All logical devices:", tf.config.list_logical_devices())
print("All physical devices:", tf.config.list_physical_devices())

# Use the first logical GPU when one exists; otherwise fall back to the CPU.
gpu_devices = tf.config.list_logical_devices(device_type="GPU")
device = gpu_devices[0] if gpu_devices else "/device:CPU:0"

All logical devices: [LogicalDevice(name='/device:CPU:0', device_type='CPU'), LogicalDevice(name='/device:GPU:0', device_type='GPU')]
All physical devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

הצבת פעולות: `tf.device`

ניתן להציב פעולות על מכשיר מסוים על ידי קריאה להן בתוך תחום (scope) של tf.device .

print("Using device: %s" % str(device))
# Operations inside the `tf.device` scope run on `device`. When that is a
# GPU, they execute on the GPU and the outputs are placed in GPU memory.
with tf.device(device):
  batch_inputs = create_batch(5)[0]
  prediction = model.predict(batch_inputs)

placement = prediction.device
print("prediction is placed on %s" % placement)

Using device: LogicalDevice(name='/device:GPU:0', device_type='GPU')
prediction is placed on /job:localhost/replica:0/task:0/device:GPU:0

העתקת מערכי ND בין מכשירים: `tnp.copy`

קריאה ל- tnp.copy , הממוקמת בהיקף מכשיר מסוים, תעתיק את הנתונים לאותו מכשיר, אלא אם הנתונים כבר נמצאים באותו מכשיר.

# Copy `prediction` inside a CPU device scope, then show where the original
# and the copy are placed.
with tf.device("/device:CPU:0"):
  prediction_cpu = tnp.copy(prediction)
for placed in (prediction, prediction_cpu):
  print(placed.device)

/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:CPU:0

השוואות ביצועים

TensorFlow NumPy משתמשת בגרעיני TensorFlow שעברו אופטימיזציה גבוהה שניתן לשלוח על מעבדי CPU, GPU ו-TPU. TensorFlow גם מבצעת אופטימיזציות מהדר רבות, כמו פעולת היתוך, שמתורגמות לשיפורי ביצועים וזיכרון. ראה אופטימיזציה של גרפי TensorFlow עם Grappler למידע נוסף.

עם זאת, ל- TensorFlow יש תקורה גבוהה יותר עבור פעולות שיגור בהשוואה ל-NumPy. עבור עומסי עבודה המורכבים מפעולות קטנות (פחות מ-10 מיקרו-שניות בערך), התקורות הללו יכולות לשלוט בזמן הריצה ו-NumPy יכולה לספק ביצועים טובים יותר. במקרים אחרים, TensorFlow אמור לספק ביצועים טובים יותר.

הרץ את מבחן הביצועים (benchmark) שלהלן כדי להשוות את ביצועי NumPy ו- TensorFlow NumPy עבור גדלי קלט שונים.

def benchmark(f, inputs, number=30, force_gpu_sync=False):
  """Times `f` on each value in `inputs`.

  Args:
    f: Callable taking one argument.
    inputs: Iterable of values; `f` is timed on each one separately.
    number: How many timed calls to make per input, after one warmup call.
    force_gpu_sync: If True, each call also creates a scalar ND array before
      `f` and copies it to "CPU:0" afterwards, intended to force a sync in
      the GPU case.

  Returns:
    A list with the average time per call, in milliseconds, for each input.
  """
  def _make_timed_call(inp):
    def _call():
      if force_gpu_sync:
        one = tnp.asarray(1)
        f(inp)
        with tf.device("CPU:0"):
          tnp.copy(one)  # Force a sync for GPU case
      else:
        f(inp)
    return _call

  per_call_ms = []
  for inp in inputs:
    timed_call = _make_timed_call(inp)
    timed_call()  # warmup
    total_seconds = timeit.timeit(timed_call, number=number)
    per_call_ms.append(total_seconds * 1000. / number)
  return per_call_ms


def plot(np_times, tnp_times, compiled_tnp_times, has_gpu, tnp_times_gpu):
  """Plots each runtime series against the module-level `sizes`.

  Args:
    np_times: Runtimes plotted as "NumPy".
    tnp_times: Runtimes plotted as "TF NumPy (CPU)".
    compiled_tnp_times: Runtimes plotted as "Compiled TF NumPy (CPU)".
    has_gpu: Truthy if the GPU series should be plotted.
    tnp_times_gpu: Runtimes plotted as "TF NumPy (GPU)"; only read when
      `has_gpu` is truthy.
  """
  plt.xlabel("size")
  plt.ylabel("time (ms)")
  plt.title("Sigmoid benchmark: TF NumPy vs NumPy")
  series = [
      (np_times, "NumPy"),
      (tnp_times, "TF NumPy (CPU)"),
      (compiled_tnp_times, "Compiled TF NumPy (CPU)"),
  ]
  if has_gpu:
    series.append((tnp_times_gpu, "TF NumPy (GPU)"))
  for times, label in series:
    plt.plot(sizes, times, label=label)
  plt.legend()

# Define a simple implementation of `sigmoid`, and benchmark it using
# NumPy and TensorFlow NumPy for different input sizes.

def np_sigmoid(y):
  """Returns the sigmoid `1 / (1 + exp(-y))` of `y`, computed with NumPy."""
  denominator = 1. + np.exp(-y)
  return 1. / denominator

def tnp_sigmoid(y):
  """Returns the sigmoid `1 / (1 + exp(-y))` of `y`, computed with TF NumPy."""
  denominator = 1. + tnp.exp(-y)
  return 1. / denominator

@tf.function
def compiled_tnp_sigmoid(y):
  """Calls `tnp_sigmoid(y)` from inside a `tf.function`."""
  return tnp_sigmoid(y)

# Input sizes: 2**0, 2**5, 2**10, 2**15 and 2**20 elements.
sizes = tuple(2 ** exponent for exponent in (0, 5, 10, 15, 20))

# Plain NumPy baseline.
np_inputs = [np.random.randn(size).astype(np.float32) for size in sizes]
np_times = benchmark(np_sigmoid, np_inputs)

# TF NumPy on the CPU, both eager and compiled with `tf.function`.
with tf.device("/device:CPU:0"):
  tnp_inputs = [tnp.random.randn(size).astype(np.float32) for size in sizes]
  tnp_times = benchmark(tnp_sigmoid, tnp_inputs)
  compiled_tnp_times = benchmark(compiled_tnp_sigmoid, tnp_inputs)

# `has_gpu` holds the number of logical GPUs and is used for its truthiness.
has_gpu = len(tf.config.list_logical_devices("GPU"))
tnp_times_gpu = None
if has_gpu:
  with tf.device("/device:GPU:0"):
    tnp_inputs = [tnp.random.randn(size).astype(np.float32) for size in sizes]
    tnp_times_gpu = benchmark(compiled_tnp_sigmoid, tnp_inputs,
                              number=100, force_gpu_sync=True)
plot(np_times, tnp_times, compiled_tnp_times, has_gpu, tnp_times_gpu)

png