Unverified commit 1b3cd0fb. Author: J joejiong. Committer: GitHub

Add random_split and Subset dataset (#29291) (#32090)

As the title
Parent 62c21734
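For context, a minimal usage sketch of the two APIs this commit adds (paddle.io.Subset and paddle.io.random_split), assuming a Paddle build that includes this change; variable names are illustrative and the exact contents of each split depend on the global random seed:

    from paddle.io import Subset, random_split

    # Subset selects elements of an existing dataset by index.
    sub = Subset(dataset=range(1, 4), indices=[0, 2])
    print(list(sub))  # [1, 3]

    # random_split shuffles the indices and partitions them into the given lengths.
    train, valid = random_split(range(10), [7, 3])
    print(len(train), len(valid))  # 7 3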
@@ -19,7 +19,7 @@ import paddle.dataset.common
__all__ = [
    "Dataset", "IterableDataset", "TensorDataset", "ComposeDataset",
-    "ChainDataset"
+    "ChainDataset", "random_split", "Subset"
]
@@ -405,3 +405,131 @@ class ChainDataset(IterableDataset):
        for dataset in self.datasets:
            for sample in dataset:
                yield sample
class Subset(Dataset):
    """
    Subset of a dataset at specified indices.

    Args:
        dataset (Dataset): The whole Dataset.
        indices (sequence): Indices in the whole set selected for subset.

    Returns:
        Dataset: A Dataset which is the subset of the original dataset.

    Example code:

        .. code-block:: python

            import paddle
            from paddle.io import Subset

            # example 1:
            a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
            print(list(a))
            # [1, 3]

            # example 2:
            b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
            print(list(b))
            # [2, 2]
    """

    def __init__(self, dataset, indices):
        self.dataset = dataset
        self.indices = indices

    def __getitem__(self, idx):
        return self.dataset[self.indices[idx]]

    def __len__(self):
        return len(self.indices)
def random_split(dataset, lengths, generator=None):
    """
    Randomly split a dataset into non-overlapping new datasets of given lengths.
    Optionally fix the generator for reproducible results.

    Args:
        dataset (Dataset): Dataset to be split.
        lengths (sequence): lengths of the splits to be produced.
        generator (Generator, optional): Generator used for the random permutation.
            Default is None, in which case the default generator (seeded via ``manual_seed()``) is used.

    Returns:
        Datasets: A list of Subset datasets, which are non-overlapping subsets of the original Dataset.

    Example code:

        .. code-block:: python

            import paddle
            from paddle.io import random_split

            a_list = paddle.io.random_split(range(10), [3, 7])
            print(len(a_list))
            # 2

            for idx, v in enumerate(a_list[0]):
                print(idx, v)
            # output of the first subset (values depend on the random seed)
            # 0 1
            # 1 3
            # 2 9

            for idx, v in enumerate(a_list[1]):
                print(idx, v)
            # output of the second subset (values depend on the random seed)
            # 0 5
            # 1 7
            # 2 8
            # 3 6
            # 4 0
            # 5 2
            # 6 4
    """
    # Cannot verify that dataset is Sized
    if sum(lengths) != len(dataset):  # type: ignore
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!")

    # TODO(@Joejiong): support Variable or Tensor type with .tolist class member function.
    # For example var.item() and var.tolist()
    indices = paddle.randperm(sum(lengths)).numpy().tolist()
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
def _accumulate(iterable, fn=lambda x, y: x + y):
    """
    Return running totals.

    Args:
        iterable: any iterable object, for example a dataset or a list of lengths.
        fn (callable, optional): binary function used to combine the running total
            with the next element. Defaults to ``lambda x, y: x + y``.

    Yields:
        The running total from the beginning of the iterable up to the current element.

    Example code:

        .. code-block:: python

            _accumulate([1, 2, 3, 4, 5]) --> 1 3 6 10 15
            _accumulate([1, 2, 3, 4, 5], operator.mul) --> 1 2 6 24 120
    """
    it = iter(iterable)
    try:
        total = next(it)
    except StopIteration:
        return
    yield total
    for element in it:
        total = fn(total, element)
        yield total
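As a quick aid for following the slicing above, a sketch of the offsets _accumulate yields for a two-way split (illustrative values only):

    lengths = [3, 7]
    offsets = list(_accumulate(lengths))  # [3, 10]
    # zip(offsets, lengths) pairs (3, 3) and (10, 7), so random_split builds its
    # subsets from indices[0:3] and indices[3:10] of the shuffled index list.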
@@ -20,8 +20,7 @@ import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.io import Dataset, IterableDataset, TensorDataset, \
-    ComposeDataset, ChainDataset, DataLoader
-from paddle.fluid.dygraph.base import to_variable
+    ComposeDataset, ChainDataset, DataLoader, random_split, Subset
IMAGE_SIZE = 32
@@ -54,14 +53,14 @@ class RandomIterableDataset(IterableDataset):
class TestTensorDataset(unittest.TestCase):
    def run_main(self, num_workers, places):
-        fluid.default_startup_program().random_seed = 1
-        fluid.default_main_program().random_seed = 1
-        place = fluid.CPUPlace()
+        paddle.static.default_startup_program().random_seed = 1
+        paddle.static.default_main_program().random_seed = 1
+        place = paddle.CPUPlace()
        with fluid.dygraph.guard(place):
            input_np = np.random.random([16, 3, 4]).astype('float32')
-            input = to_variable(input_np)
+            input = paddle.to_tensor(input_np)
            label_np = np.random.random([16, 1]).astype('int32')
-            label = to_variable(label_np)
+            label = paddle.to_tensor(label_np)
            dataset = TensorDataset([input, label])
            assert len(dataset) == 16
@@ -83,17 +82,17 @@ class TestTensorDataset(unittest.TestCase):
                assert np.allclose(label.numpy(), label_np[i])

    def test_main(self):
-        places = [fluid.CPUPlace()]
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
+        places = [paddle.CPUPlace()]
+        if paddle.is_compiled_with_cuda():
+            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)


class TestComposeDataset(unittest.TestCase):
    def test_main(self):
-        fluid.default_startup_program().random_seed = 1
-        fluid.default_main_program().random_seed = 1
+        paddle.static.default_startup_program().random_seed = 1
+        paddle.static.default_main_program().random_seed = 1
        dataset1 = RandomDataset(10)
        dataset2 = RandomDataset(10)
@@ -110,10 +109,104 @@ class TestComposeDataset(unittest.TestCase):
            assert np.allclose(label2, label2_t)


class TestRandomSplitApi(unittest.TestCase):
    def test_main(self):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        dataset1, dataset2 = paddle.io.random_split(range(5), [1, 4])

        self.assertTrue(len(dataset1) == 1)
        self.assertTrue(len(dataset2) == 4)

        elements_list = list(range(5))

        for _, val in enumerate(dataset1):
            elements_list.remove(val)

        for _, val in enumerate(dataset2):
            elements_list.remove(val)

        self.assertTrue(len(elements_list) == 0)


class TestRandomSplitError(unittest.TestCase):
    def test_errors(self):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        self.assertRaises(ValueError, paddle.io.random_split, range(5), [3, 8])
        self.assertRaises(ValueError, paddle.io.random_split, range(5), [8])
        self.assertRaises(ValueError, paddle.io.random_split, range(5), [])


class TestSubsetDataset(unittest.TestCase):
    def run_main(self, num_workers, places):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        input_np = np.random.random([5, 3, 4]).astype('float32')
        input = paddle.to_tensor(input_np)
        label_np = np.random.random([5, 1]).astype('int32')
        label = paddle.to_tensor(label_np)

        dataset = TensorDataset([input, label])
        even_subset = paddle.io.Subset(dataset, [0, 2, 4])
        odd_subset = paddle.io.Subset(dataset, [1, 3])

        assert len(dataset) == 5

        def prepare_dataloader(dataset):
            return DataLoader(
                dataset,
                places=places,
                num_workers=num_workers,
                batch_size=1,
                drop_last=True)

        dataloader = prepare_dataloader(dataset)
        dataloader_even = prepare_dataloader(even_subset)
        dataloader_odd = prepare_dataloader(odd_subset)

        def assert_basic(input, label):
            assert len(input) == 1
            assert len(label) == 1
            assert input.shape == [1, 3, 4]
            assert label.shape == [1, 1]
            assert isinstance(input, paddle.Tensor)
            assert isinstance(label, paddle.Tensor)

        elements_list = list()
        for _, (input, label) in enumerate(dataloader()):
            assert_basic(input, label)
            elements_list.append(label)

        for _, (input, label) in enumerate(dataloader_even()):
            assert_basic(input, label)
            elements_list.remove(label)

        odd_list = list()
        for _, (input, label) in enumerate(dataloader_odd()):
            assert_basic(input, label)
            odd_list.append(label)

        self.assertEqual(odd_list, elements_list)

    def test_main(self):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        places = [paddle.CPUPlace()]
        if paddle.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)
class TestChainDataset(unittest.TestCase):
    def run_main(self, num_workers, places):
-        fluid.default_startup_program().random_seed = 1
-        fluid.default_main_program().random_seed = 1
+        paddle.static.default_startup_program().random_seed = 1
+        paddle.static.default_main_program().random_seed = 1

        dataset1 = RandomIterableDataset(10)
        dataset2 = RandomIterableDataset(10)
@@ -135,9 +228,9 @@ class TestChainDataset(unittest.TestCase):
            idx += 1

    def test_main(self):
-        places = [fluid.CPUPlace()]
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
+        places = [paddle.CPUPlace()]
+        if paddle.is_compiled_with_cuda():
+            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)
@@ -28,9 +28,11 @@ __all__ = [
    'SequenceSampler',
    'RandomSampler',
    'WeightedRandomSampler',
+    'random_split',
+    'Subset'
]
from ..fluid.io import DataLoader
from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \
    TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler, \
-    ComposeDataset, ChainDataset, WeightedRandomSampler
+    ComposeDataset, ChainDataset, WeightedRandomSampler, Subset, random_split