TFDS รองรับ รูปแบบ Croissant 🥐 แล้ว! อ่าน เอกสาร เพื่อทราบข้อมูลเพิ่มเติม

หน้านี้ได้รับการแปลโดย Cloud Translation API

TFDS กับความเป็นดีเทอร์มินิสติก (determinism)

เอกสารนี้อธิบาย:

การรับประกันของ TFDS ในเรื่องความเป็นดีเทอร์มินิสติก (determinism)
TFDS อ่านตัวอย่างในลำดับใด
คำเตือนและ gotchas ต่างๆ

ติดตั้ง

ชุดข้อมูล

จำเป็นต้องมีบริบทบางอย่างเพื่อทำความเข้าใจว่า TFDS อ่านข้อมูลอย่างไร

ระหว่างการสร้างชุดข้อมูล TFDS จะเขียนข้อมูลต้นฉบับลงในไฟล์ .tfrecord ที่เป็นมาตรฐาน สำหรับชุดข้อมูลขนาดใหญ่จะมีการสร้างไฟล์ .tfrecord หลายไฟล์ โดยแต่ละไฟล์มีหลายตัวอย่าง เราเรียกไฟล์ .tfrecord แต่ละไฟล์ว่า ชาร์ด (shard)

คู่มือนี้ใช้ imagenet ซึ่งมี 1024 ชาร์ด:

import re
import tensorflow_datasets as tfds

imagenet = tfds.builder('imagenet2012')

# Look up the train split's metadata once and read both counts from it.
train_split = imagenet.info.splits['train']
num_shards = train_split.num_shards
num_examples = train_split.num_examples
print(f'imagenet has {num_shards} shards ({num_examples} examples)')

imagenet has 1024 shards (1281167 examples)

ค้นหารหัสตัวอย่างชุดข้อมูล

คุณสามารถข้ามไปยังส่วนถัดไปได้ หากต้องการทราบเฉพาะเรื่องความเป็นดีเทอร์มินิสติก (determinism)

ตัวอย่างแต่ละรายการในชุดข้อมูลถูกระบุอย่างไม่ซ้ำกันด้วย id (เช่น 'imagenet2012-train.tfrecord-01023-of-01024__32' ) คุณสามารถดึง id นี้ได้โดยกำหนด read_config.add_tfds_id = True ซึ่งจะเพิ่มคีย์ 'tfds_id' ลงใน dict ที่ได้จาก tf.data.Dataset

ในบทช่วยสอนนี้ เรากำหนดโปรแกรมอรรถประโยชน์ขนาดเล็กซึ่งจะพิมพ์รหัสตัวอย่างของชุดข้อมูล (แปลงเป็นจำนวนเต็มเพื่อให้มนุษย์อ่านได้ง่ายขึ้น):

def load_dataset(builder, **as_dataset_kwargs):
  """Load the dataset with the `tfds_id` key enabled.

  Args:
    builder: Dataset builder; only its `as_dataset` method is used here.
    **as_dataset_kwargs: Keyword arguments forwarded to `builder.as_dataset`.
      An optional `read_config` entry is used as the base read configuration.

  Returns:
    Whatever `builder.as_dataset` returns for the adjusted read config.
  """
  import copy  # Local import keeps this snippet self-contained.

  read_config = as_dataset_kwargs.pop('read_config', None)
  if read_config is None:
    # Only build the default config when the caller did not supply one.
    read_config = tfds.ReadConfig()
  else:
    # Work on a copy so the caller's config object is left unmodified.
    read_config = copy.copy(read_config)
  read_config.add_tfds_id = True  # Set `True` to return the 'tfds_id' key
  return builder.as_dataset(read_config=read_config, **as_dataset_kwargs)

def print_ex_ids(
    builder,
    *,
    take: int,
    skip: int = None,
    **as_dataset_kwargs,
) -> None:
  """Print the integer example ids read from the requested dataset split.

  Args:
    builder: Dataset builder, passed to `load_dataset` and `id_to_int`.
    take: Number of examples to keep, forwarded to `ds.take`.
    skip: Optional number of leading examples to drop via `ds.skip`; a falsy
      value means nothing is skipped.
    **as_dataset_kwargs: Extra keyword arguments forwarded to `load_dataset`.
  """
  dataset = load_dataset(builder, **as_dataset_kwargs)
  dataset = dataset.skip(skip) if skip else dataset

  # First collect every raw string id, then convert them to integers.
  raw_ids = []
  for example in dataset.take(take):
    raw_ids.append(example['tfds_id'].numpy().decode('utf-8'))
  print([id_to_int(raw_id, builder=builder) for raw_id in raw_ids])

def id_to_int(tfds_id: str, builder) -> int:
  """Convert a `tfds_id` string into a split-wide integer index.

  Args:
    tfds_id: Id such as 'imagenet2012-train.tfrecord-01023-of-01024__32',
      i.e. '<name>-<split>.<format>-<shard>-of-<total>__<index in shard>'.
    builder: Object exposing `info.splits[<split>].shard_lengths`.

  Returns:
    The sum of the lengths of all shards before the example's shard, plus
    the example's index inside its shard.

  Raises:
    ValueError: If `tfds_id` does not have the expected layout.
  """
  # The dot before the file-format suffix is escaped so that only a literal
  # '.' is accepted there.
  match = re.match(r'\w+-(\w+)\.\w+-(\d+)-of-\d+__(\d+)', tfds_id)
  if match is None:
    raise ValueError(f'Unrecognized tfds_id format: {tfds_id!r}')
  split_name, shard_id, ex_id = match.groups()
  split_info = builder.info.splits[split_name]
  # Offset by the number of examples stored in the preceding shards.
  return sum(split_info.shard_lengths[:int(shard_id)]) + int(ex_id)

ความเป็นดีเทอร์มินิสติกเมื่ออ่านข้อมูล

ส่วนนี้อธิบายการรับประกันความเป็นดีเทอร์มินิสติก (determinism) ของ tfds.load

ด้วย `shuffle_files=False` (เริ่มต้น)

โดยค่าเริ่มต้น TFDS จะให้ตัวอย่างออกมาแบบดีเทอร์มินิสติก ( shuffle_files=False )

# Same as: imagenet.as_dataset(split='train').take(20)
# Issue the identical read twice so the two printed lists can be compared.
for _ in range(2):
  print_ex_ids(imagenet, split='train', take=20)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1251, 1252, 1253, 1254]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1251, 1252, 1253, 1254]

เพื่อประสิทธิภาพ TFDS จะอ่านหลายชาร์ดพร้อมกันโดยใช้ tf.data.Dataset.interleave ในตัวอย่างนี้เราจะเห็นว่า TFDS สลับไปยังชาร์ดที่ 2 หลังจากอ่านไปแล้ว 16 ตัวอย่าง ( ..., 14, 15, 1251, 1252, ... ) ดูรายละเอียดเพิ่มเติมเกี่ยวกับ interleave ได้ด้านล่าง

ในทำนองเดียวกัน subsplit API ก็เป็นแบบดีเทอร์มินิสติกเช่นกัน:

# Read the same subsplit twice so the two printed lists can be compared.
for _ in range(2):
  print_ex_ids(imagenet, split='train[67%:84%]', take=20)

[858382, 858383, 858384, 858385, 858386, 858387, 858388, 858389, 858390, 858391, 858392, 858393, 858394, 858395, 858396, 858397, 859533, 859534, 859535, 859536]
[858382, 858383, 858384, 858385, 858386, 858387, 858388, 858389, 858390, 858391, 858392, 858393, 858394, 858395, 858396, 858397, 859533, 859534, 859535, 859536]

หากคุณฝึกโมเดลมากกว่าหนึ่ง epoch ไม่แนะนำให้ใช้การตั้งค่าข้างต้น เนื่องจากทุก epoch จะอ่านชาร์ดในลำดับเดียวกัน (ดังนั้นความสุ่มจึงถูกจำกัดอยู่ที่ขนาดบัฟเฟอร์ของ ds = ds.shuffle(buffer) )

ด้วย `shuffle_files=True`

เมื่อใช้ shuffle_files=True ชาร์ดจะถูกสับเปลี่ยนลำดับใหม่ในแต่ละ epoch ดังนั้นการอ่านจึงไม่เป็นแบบดีเทอร์มินิสติกอีกต่อไป

# Two passes with file shuffling enabled; no seed is supplied here.
for _ in range(2):
  print_ex_ids(imagenet, split='train', shuffle_files=True, take=20)

[568017, 329050, 329051, 329052, 329053, 329054, 329056, 329055, 568019, 568020, 568021, 568022, 568023, 568018, 568025, 568024, 568026, 568028, 568030, 568031]
[43790, 43791, 43792, 43793, 43796, 43794, 43797, 43798, 43795, 43799, 43800, 43801, 43802, 43803, 43804, 43805, 43806, 43807, 43809, 43810]

ดูสูตรด้านล่างเพื่อสับเปลี่ยนลำดับไฟล์แบบดีเทอร์มินิสติก

ข้อควรระวังเรื่องความเป็นดีเทอร์มินิสติก: อาร์กิวเมนต์ของ interleave

การเปลี่ยน read_config.interleave_cycle_length หรือ read_config.interleave_block_length จะทำให้ลำดับของตัวอย่างเปลี่ยนไป

TFDS อาศัย tf.data.Dataset.interleave เพื่อโหลดชาร์ดเพียงไม่กี่ชาร์ดในแต่ละครั้ง ซึ่งช่วยเพิ่มประสิทธิภาพและลดการใช้หน่วยความจำ

ลำดับของตัวอย่างรับประกันว่าจะเหมือนเดิมก็ต่อเมื่อค่าอาร์กิวเมนต์ของ interleave คงที่เท่านั้น ดู เอกสารของ interleave เพื่อทำความเข้าใจว่า cycle_length และ block_length หมายถึงอะไร

cycle_length=16 , block_length=16 (ค่าเริ่มต้นเช่นเดียวกับด้านบน):

# No read_config is passed, so the default interleave settings apply; the
# printed ids below jump from 15 to 1251 after the first 16 examples.
print_ex_ids(imagenet, split='train', take=20)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1251, 1252, 1253, 1254]

cycle_length=3 , block_length=2 :

# Interleave over 3 shards at a time, reading 2 consecutive examples from
# each shard before moving on to the next one.
read_config = tfds.ReadConfig(
    interleave_cycle_length=3,
    interleave_block_length=2,
)
print_ex_ids(imagenet, split='train', read_config=read_config, take=20)

[0, 1, 1251, 1252, 2502, 2503, 2, 3, 1253, 1254, 2504, 2505, 4, 5, 1255, 1256, 2506, 2507, 6, 7]

ในตัวอย่างที่สอง เราจะเห็นว่าชุดข้อมูลอ่าน 2 ตัวอย่าง ( block_length=2 ) จากชาร์ดหนึ่ง แล้วสลับไปยังชาร์ดถัดไป และทุกๆ 2 * 3 ตัวอย่าง ( cycle_length=3 ) จะวนกลับไปที่ชาร์ดแรก ( shard0-ex0, shard0-ex1, shard1-ex0, shard1-ex1, shard2-ex0, shard2-ex1, shard0-ex2, shard0-ex3, shard1-ex2, shard1-ex3, shard2-ex2,... )

subsplit กับลำดับของตัวอย่าง

แต่ละตัวอย่างมี id เป็น 0, 1, ..., num_examples-1 โดย subsplit API จะเลือกช่วง (slice) ของตัวอย่าง (เช่น train[:x] เลือก 0, 1, ..., x-1 )

อย่างไรก็ตาม ภายใน subsplit ตัวอย่างจะไม่ถูกอ่านตามลำดับ id ที่เพิ่มขึ้น (เนื่องจากชาร์ดและ interleave)

กล่าวให้ชัดคือ ds.take(x) และ split='train[:x]' ไม่เทียบเท่ากัน!

เห็นได้ง่ายจากตัวอย่าง interleave ด้านบน ซึ่งตัวอย่างมาจากชาร์ดที่ต่างกัน

# Compare truncating the full split with `.take(25)` against requesting the
# `train[:25]` subsplit. `take=-1` is forwarded to `ds.take`; the second
# printed list below still holds all 25 ids of the subsplit.
print_ex_ids(imagenet, split='train', take=25)  # tfds.load(..., split='train').take(25)
print_ex_ids(imagenet, split='train[:25]', take=-1)  # tfds.load(..., split='train[:25]')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]

หลังจากอ่านครบ 16 ตัวอย่าง (block_length) .take(25) จะสลับไปยังชาร์ดถัดไป ในขณะที่ train[:25] ยังคงอ่านตัวอย่างจากชาร์ดแรกต่อไป

สูตร

สับเปลี่ยนลำดับไฟล์แบบดีเทอร์มินิสติก

มี 2 วิธีในการสับเปลี่ยนแบบดีเทอร์มินิสติก:

กำหนดค่า shuffle_seed หมายเหตุ: วิธีนี้ต้องเปลี่ยน seed ในแต่ละ epoch มิฉะนั้นชาร์ดจะถูกอ่านในลำดับเดียวกันทุก epoch

# Pin the seed used for file shuffling.
read_config = tfds.ReadConfig(shuffle_seed=32)

# Deterministic order, different from the default shuffle_files=False above
for _ in range(2):
  print_ex_ids(
      imagenet,
      split='train',
      shuffle_files=True,
      read_config=read_config,
      take=22,
  )

[176411, 176412, 176413, 176414, 176415, 176416, 176417, 176418, 176419, 176420, 176421, 176422, 176423, 176424, 176425, 176426, 710647, 710648, 710649, 710650, 710651, 710652]
[176411, 176412, 176413, 176414, 176415, 176416, 176417, 176418, 176419, 176420, 176421, 176422, 176423, 176424, 176425, 176426, 710647, 710648, 710649, 710650, 710651, 710652]

ใช้ experimental_interleave_sort_fn : วิธีนี้ให้คุณควบคุมได้อย่างเต็มที่ว่าจะอ่านชาร์ดใดและในลำดับใด แทนที่จะอาศัยลำดับจาก ds.shuffle

def _reverse_order(file_instructions):
  return list(reversed(file_instructions))

# Hand the custom ordering function to the read config so that it controls
# the order in which the shard files are read.
read_config = tfds.ReadConfig(
    experimental_interleave_sort_fn=_reverse_order,
)

# Last shard (01023-of-01024) is read first
print_ex_ids(imagenet, split='train', read_config=read_config, take=5)

[1279916, 1279917, 1279918, 1279919, 1279920]

สร้างไปป์ไลน์แบบดีเทอร์มินิสติกที่รองรับการถูก preempt

อันนี้ซับซ้อนกว่า ไม่มีทางแก้ที่ง่ายและน่าพอใจ

หากไม่ใช้ ds.shuffle และใช้การสับเปลี่ยนแบบดีเทอร์มินิสติก ในทางทฤษฎีควรเป็นไปได้ที่จะนับจำนวนตัวอย่างที่อ่านไปแล้ว และอนุมานว่าในแต่ละชาร์ดมีตัวอย่างใดถูกอ่านไปแล้วบ้าง (เป็นฟังก์ชันของ cycle_length , block_length และลำดับของชาร์ด) จากนั้นจึงส่งค่า skip , take ของแต่ละชาร์ดผ่าน experimental_interleave_sort_fn
หากใช้ ds.shuffle มีแนวโน้มว่าจะเป็นไปไม่ได้หากไม่เล่นซ้ำไปป์ไลน์การฝึกทั้งหมด เพราะต้องบันทึกสถานะบัฟเฟอร์ของ ds.shuffle เพื่ออนุมานว่าตัวอย่างใดถูกอ่านไปแล้ว ตัวอย่างที่อ่านอาจไม่ต่อเนื่องกัน (เช่น อ่าน shard5_ex2 , shard5_ex4 แล้ว แต่ยังไม่ได้อ่าน shard5_ex3 )
หากใช้ ds.shuffle วิธีหนึ่งคือบันทึก shards_ids / example_ids ทั้งหมดที่อ่านไปแล้ว (อนุมานจาก tfds_id ) แล้วอนุมานคำสั่งอ่านไฟล์ (file instructions) จากข้อมูลนั้น

กรณีที่ง่ายที่สุดสำหรับข้อ 1. คือทำให้ .skip(x).take(y) ตรงกับ train[x:x+y] ซึ่งต้อง:

ตั้งค่า cycle_length=1 (เพื่อให้อ่านชาร์ดตามลำดับทีละชาร์ด)
ตั้งค่า shuffle_files=False
ไม่ใช้ ds.shuffle

ควรใช้เฉพาะกับชุดข้อมูลขนาดใหญ่มากที่ฝึกเพียง 1 epoch เท่านั้น ตัวอย่างจะถูกอ่านตามลำดับการสับเปลี่ยนเริ่มต้น

read_config = tfds.ReadConfig(
    interleave_cycle_length=1,  # Read shards sequentially
)

# Drop the first 40 examples with `skip`, then print the next 22 ids.
print_ex_ids(imagenet, split='train', read_config=read_config, skip=40, take=22)
# If the job gets pre-empted, using the subsplit API will skip at most `len(shard0)`
print_ex_ids(imagenet, split='train[40:]', read_config=read_config, take=22)

[40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61]
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61]

ค้นหาว่าชาร์ด/ตัวอย่างใดถูกอ่านสำหรับ subsplit ที่กำหนด

ด้วย tfds.core.DatasetInfo คุณสามารถเข้าถึงคำสั่งอ่าน (read instructions) ได้โดยตรง

# Per-file read instructions (filename, skip, take, num_examples) backing
# the requested subsplit; shown as the cell's output below.
imagenet.info.splits['train[44%:45%]'].file_instructions

[FileInstruction(filename='imagenet2012-train.tfrecord-00450-of-01024', skip=700, take=-1, num_examples=551),
 FileInstruction(filename='imagenet2012-train.tfrecord-00451-of-01024', skip=0, take=-1, num_examples=1251),
 FileInstruction(filename='imagenet2012-train.tfrecord-00452-of-01024', skip=0, take=-1, num_examples=1251),
 FileInstruction(filename='imagenet2012-train.tfrecord-00453-of-01024', skip=0, take=-1, num_examples=1251),
 FileInstruction(filename='imagenet2012-train.tfrecord-00454-of-01024', skip=0, take=-1, num_examples=1252),
 FileInstruction(filename='imagenet2012-train.tfrecord-00455-of-01024', skip=0, take=-1, num_examples=1251),
 FileInstruction(filename='imagenet2012-train.tfrecord-00456-of-01024', skip=0, take=-1, num_examples=1251),
 FileInstruction(filename='imagenet2012-train.tfrecord-00457-of-01024', skip=0, take=-1, num_examples=1251),
 FileInstruction(filename='imagenet2012-train.tfrecord-00458-of-01024', skip=0, take=-1, num_examples=1251),
 FileInstruction(filename='imagenet2012-train.tfrecord-00459-of-01024', skip=0, take=-1, num_examples=1251),
 FileInstruction(filename='imagenet2012-train.tfrecord-00460-of-01024', skip=0, take=1001, num_examples=1001)]