#!/usr/bin/env python3
# Copyright (c) 2021 Institute for Quantum Computing, Baidu Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


r"""
The source file of the dataset.
"""

import random
import math
from typing import Tuple, Union, Optional
import numpy as np
import paddle
import paddle.vision.transforms as transform
from sklearn.model_selection import train_test_split
from sklearn import datasets
from paddle_quantum.gate import RY, RZ, U3, CNOT, IQPEncoding, AmplitudeEncoding

__all__ = [
    "Dataset",
    "VisionDataset",
    "SimpleDataset",
    "MNIST",
    "FashionMNIST",
    "Iris",
    "BreastCancer"
]

import paddle_quantum.gate

# data modes
DATAMODE_TRAIN = "train"
DATAMODE_TEST = "test"

# encoding methods
ANGLE_ENCODING = "angle_encoding"
AMPLITUDE_ENCODING = "amplitude_encoding"
PAULI_ROTATION_ENCODING = "pauli_rotation_encoding"

LINEAR_ENTANGLED_ENCODING = "linear_entangled_encoding"
REAL_ENTANGLED_ENCODING = "real_entangled_encoding"
COMPLEX_ENTANGLED_ENCODING = "complex_entangled_encoding"
IQP_ENCODING = "IQP_encoding"

# downscaling method
DOWNSCALINGMETHOD_PCA = "PCA"
DOWNSCALINGMETHOD_RESIZE = "resize"


def _normalize(x):
    r"""normalize vector ``x`` and the maximum will be pi. This is an internal function.
    """
    xx = np.abs(x)
    if xx.max() > 0:
        return x * np.pi / xx.max()
    return x


def _normalize_image(x):
    r"""normalize image vector ``x`` and the maximum will be pi. This is an internal function.
    """
    return x * np.pi / 256


def _crop(images, border):
    r"""crop ``images`` according to ``border``. This is an internal function.
    """
    new_images = []
    for i in range(len(images)):
        size = int(np.sqrt(len(images[i])))
        temp_image = images[i].reshape((size, size))
        temp_image = temp_image[border[0]:border[1], border[0]:border[1]]
        new_images.append(temp_image.flatten())
    return new_images


class Dataset(object):
    r"""Base class for all datasets, integrating multiple quantum encoding methods.
    """

    def __init__(self):
        return

    def data2circuit(
            self, classical_data: list, encoding: str, num_qubits: int, can_describe_dimension: int,
            split_circuit: bool, return_state: bool, is_image: Optional[bool] = False
    ) -> Tuple[list, list]:
        r"""Encode the input ``classical_data`` into quantum states using ``encoding``; the classical data are truncated or zero-padded beforehand.

        Args:
            classical_data: Vectors to be encoded, already truncated or zero-padded to length ``can_describe_dimension``.
                For example, amplitude encoding can describe vectors of dimension ``2 ** n``.
            encoding: The encoding method.
            num_qubits: The number of qubits.
            can_describe_dimension: The dimension that the circuit can describe with ``encoding``.
            split_circuit: Whether to split the circuit.
            return_state: Whether to return the quantum state.
            is_image: Whether the data are images; if so, the normalization method differs. Defaults to ``False``.

        Raises:
            Exception: Returning circuits is not supported for amplitude encoding.

        Returns:
            If ``return_state == True``, return the encoded quantum states; otherwise, return the encoding circuits.
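
        A minimal usage sketch (hedged illustration; the toy vector below is already padded to ``can_describe_dimension``)::

            data = [np.array([0.1, 0.2, 0.3, 0.4])]
            states, circuits = Dataset().data2circuit(
                data, "angle_encoding", num_qubits=2, can_describe_dimension=4,
                split_circuit=False, return_state=True,
            )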
        """
        quantum_states = classical_data.copy()
        quantum_circuits = classical_data.copy()
        if encoding == AMPLITUDE_ENCODING:
            # Returning circuits is not supported for amplitude encoding
            if return_state is False or split_circuit is True:
                raise Exception("Returning circuits is not supported for amplitude encoding.")
            for i in range(len(classical_data)):
                x = paddle.to_tensor(_normalize(classical_data[i]))
                if is_image:
                    x = paddle.to_tensor(_normalize_image(classical_data[i]))
                circuit = AmplitudeEncoding(qubits_idx='full', num_qubits=num_qubits)
                state = circuit(x)
                quantum_states[i] = state.data.numpy()

        elif encoding == ANGLE_ENCODING:
            for i in range(len(classical_data)):
                one_block_param = 1 * num_qubits
                depth = int(can_describe_dimension / one_block_param)
                param = paddle.to_tensor(_normalize(classical_data[i]))
                if is_image:
                    param = paddle.to_tensor(_normalize_image(classical_data[i]))
                param = paddle.reshape(param, (depth, num_qubits, 1))
                which_qubits = list(range(num_qubits))
                if split_circuit:
                    quantum_circuits[i] = []
                    for repeat in range(depth):
                        circuit = paddle_quantum.ansatz.Sequential()
                        for k, q in enumerate(which_qubits):
                            circuit.append(RY(qubits_idx=q, param=param[repeat][k][0]))
                        quantum_circuits[i].append(circuit)
                else:
                    circuit = paddle_quantum.ansatz.Sequential()
                    for repeat in range(depth):
                        for k, q in enumerate(which_qubits):
                            circuit.append(RY(qubits_idx=q, param=param[repeat][k][0]))
                    state_out = circuit(paddle_quantum.state.zero_state(num_qubits))
                    quantum_states[i] = state_out.data.numpy()
                    quantum_circuits[i] = [circuit]

        elif encoding == IQP_ENCODING:
            for i in range(len(classical_data)):
                one_block_param = 1 * num_qubits
                depth = int(can_describe_dimension / one_block_param)
                param = paddle.to_tensor(_normalize(classical_data[i]))
                if is_image:
                    param = paddle.to_tensor(_normalize_image(classical_data[i]))
                param = paddle.reshape(param, (depth, num_qubits))
                if split_circuit:
                    quantum_circuits[i] = []
                    for repeat in range(depth):
                        circuit = paddle_quantum.ansatz.Sequential()
                        s = []
                        for k in range(num_qubits - 1):
                            s.append([k, k + 1])
                        # r is the number of times the IQP unitary U is repeated
                        r = 1
                        circuit.append(IQPEncoding(feature=param[repeat], num_repeat=r, qubits_idx=s))
                        quantum_circuits[i].append(circuit)
                else:
                    circuit = paddle_quantum.ansatz.Sequential()
                    for repeat in range(depth):
                        s = []
                        for k in range(num_qubits - 1):
                            s.append([k, k + 1])
                        # r is the number of times the IQP unitary U is repeated
                        r = 1
                        circuit.append(IQPEncoding(feature=param[repeat], num_repeat=r, qubits_idx=s))
                    state_out = circuit(paddle_quantum.state.zero_state(num_qubits))
                    quantum_states[i] = state_out.data.numpy()
                    quantum_circuits[i] = [circuit]

        elif encoding == PAULI_ROTATION_ENCODING:
            for i in range(len(classical_data)):
                one_block_param = 3 * num_qubits
                depth = int(can_describe_dimension / one_block_param)
                param = paddle.to_tensor(_normalize(classical_data[i]))
                if is_image:
                    param = paddle.to_tensor(_normalize_image(classical_data[i]))
                param = paddle.reshape(param, (depth, num_qubits, 3))
                which_qubits = list(range(num_qubits))
                if split_circuit:
                    quantum_circuits[i] = []
                    for repeat in range(depth):
                        circuit = paddle_quantum.ansatz.Sequential()
                        for k, q in enumerate(which_qubits):
                            circuit.append(RY(q, param=param[repeat][k][0]))
                            circuit.append(RZ(q, param=param[repeat][k][1]))
                            circuit.append(RY(q, param=param[repeat][k][2]))
                        quantum_circuits[i].append(circuit)
                else:
                    circuit = paddle_quantum.ansatz.Sequential()
                    for repeat in range(depth):
                        for k, q in enumerate(which_qubits):
                            circuit.append(RY(q, param=param[repeat][k][0]))
                            circuit.append(RZ(q, param=param[repeat][k][1]))
                            circuit.append(RY(q, param=param[repeat][k][2]))
                    state_out = circuit(paddle_quantum.state.zero_state(num_qubits))
                    quantum_states[i] = state_out.data.numpy()
                    quantum_circuits[i] = [circuit]

        elif encoding == LINEAR_ENTANGLED_ENCODING:
            for i in range(len(classical_data)):
                one_block_param = 2 * num_qubits
                depth = int(can_describe_dimension / one_block_param)
                param = paddle.to_tensor(_normalize(classical_data[i]))
                if is_image:
                    param = paddle.to_tensor(_normalize_image(classical_data[i]))
                param = paddle.reshape(param, (depth, num_qubits, 2))
                which_qubits = list(range(num_qubits))
                if split_circuit:
                    quantum_circuits[i] = []
                    for j in range(depth):
                        circuit = paddle_quantum.ansatz.Sequential()
                        for k, q in enumerate(which_qubits):
                            circuit.append(RY(q, param=param[j][k][0]))
                        for k in range(len(which_qubits) - 1):
                            circuit.append(CNOT(qubits_idx=[which_qubits[k], which_qubits[k + 1]]))
                        for k, q in enumerate(which_qubits):
                            circuit.append(RZ(q, param=param[j][k][1]))
                        for k in range(len(which_qubits) - 1):
                            circuit.append(CNOT(qubits_idx=[which_qubits[k + 1], which_qubits[k]]))
                        quantum_circuits[i].append(circuit)
                else:
                    circuit = paddle_quantum.ansatz.Sequential()
                    for j in range(depth):
                        for k, q in enumerate(which_qubits):
                            circuit.append(RY(q, param=param[j][k][0]))
                        for k in range(len(which_qubits) - 1):
                            circuit.append(CNOT(qubits_idx=[which_qubits[k], which_qubits[k + 1]]))
                        for k, q in enumerate(which_qubits):
                            circuit.append(RZ(q, param=param[j][k][1]))
                        for k in range(len(which_qubits) - 1):
                            circuit.append(CNOT(qubits_idx=[which_qubits[k + 1], which_qubits[k]]))
                    state_out = circuit(paddle_quantum.state.zero_state(num_qubits))
                    quantum_states[i] = state_out.data.numpy()
                    quantum_circuits[i] = [circuit]

        elif encoding == REAL_ENTANGLED_ENCODING:
            for i in range(len(classical_data)):
                one_block_param = 1 * num_qubits
                depth = int(can_describe_dimension / one_block_param)
                param = paddle.to_tensor(_normalize(classical_data[i]))
                if is_image:
                    param = paddle.to_tensor(_normalize_image(classical_data[i]))
                param = paddle.reshape(param, (depth, num_qubits, 1))
                which_qubits = list(range(num_qubits))
                if split_circuit:
                    quantum_circuits[i] = []
                    for repeat in range(depth):
                        circuit = paddle_quantum.ansatz.Sequential()
                        for k, q in enumerate(which_qubits):
                            circuit.append(RY(q, param=param[repeat][k][0]))
                        for k in range(len(which_qubits) - 1):
                            circuit.append(CNOT(qubits_idx=[which_qubits[k], which_qubits[k + 1]]))
                        circuit.append(CNOT(qubits_idx=[which_qubits[-1], which_qubits[0]]))
                        quantum_circuits[i].append(circuit)
                else:
                    circuit = paddle_quantum.ansatz.Sequential()
                    for repeat in range(depth):
                        for k, q in enumerate(which_qubits):
                            circuit.append(RY(q, param=param[repeat][k][0]))
                        for k in range(len(which_qubits) - 1):
                            circuit.append(CNOT(qubits_idx=[which_qubits[k], which_qubits[k + 1]]))
                        circuit.append(CNOT(qubits_idx=[which_qubits[-1], which_qubits[0]]))
                    state_out = circuit(paddle_quantum.state.zero_state(num_qubits))
                    quantum_states[i] = state_out.data.numpy()
                    quantum_circuits[i] = [circuit]

        elif encoding == COMPLEX_ENTANGLED_ENCODING:
            for i in range(len(classical_data)):
                one_block_param = 3 * num_qubits
                depth = int(can_describe_dimension / one_block_param)
                param = paddle.to_tensor(_normalize(classical_data[i]))
                if is_image:
                    param = paddle.to_tensor(_normalize_image(classical_data[i]))
                param = paddle.reshape(param, (depth, num_qubits, 3))
                which_qubits = list(range(num_qubits))
                if split_circuit:
                    quantum_circuits[i] = []
                    for repeat in range(depth):
                        circuit = paddle_quantum.ansatz.Sequential()
                        for k, q in enumerate(which_qubits):
                            circuit.append(U3(q, param=param[repeat][k]))
                        for k in range(len(which_qubits) - 1):
                            circuit.append(CNOT(qubits_idx=[which_qubits[k], which_qubits[k + 1]]))
                        circuit.append(CNOT(qubits_idx=[which_qubits[-1], which_qubits[0]]))
                        quantum_circuits[i].append(circuit)
                else:
                    circuit = paddle_quantum.ansatz.Sequential()
                    for repeat in range(depth):
                        for k, q in enumerate(which_qubits):
                            circuit.append(U3(q, param=param[repeat][k]))
                        for k in range(len(which_qubits) - 1):
                            circuit.append(CNOT(qubits_idx=[which_qubits[k], which_qubits[k + 1]]))
                        circuit.append(CNOT(qubits_idx=[which_qubits[-1], which_qubits[0]]))
                    state_out = circuit(paddle_quantum.state.zero_state(num_qubits))
                    quantum_states[i] = state_out.data.numpy()
                    quantum_circuits[i] = [circuit]
        return quantum_states, quantum_circuits

    def filter_class(self, x: Union[list, np.ndarray], y: Union[list, np.ndarray], classes: list,
                     data_num: int, need_relabel: bool, seed: Optional[int] = 0) -> Tuple[list, list]:
        r"""Select ``data_num`` samples from ``x``, ``y`` whose labels are in ``classes``.

        Args:
            x: Training features.
            y: Training labels.
            classes: Classes needed to select.
            data_num: The number of data needed to select.
            need_relabel: Whether to relabel the labels to 0, 1, 2, ... for binary classification. For example, ``[1, 2]`` will be relabeled to ``[0, 1]``.
            seed: Random seed. Defaults to ``0``.

        Returns:
            A tuple containing the following elements:

            - new_x: Selected features.
            - new_y: Selected labels corresponding to ``new_x``.
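
        A minimal usage sketch (hedged illustration with toy labels only)::

            x = [[0.1], [0.2], [0.3]]
            y = [1, 2, 1]
            new_x, new_y = Dataset().filter_class(x, y, classes=[1, 2], data_num=-1, need_relabel=True)
            # new_y == [0, 1, 0]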
        """
        new_x = []
        new_y = []
        if need_relabel:
            for i in range(len(x)):
                if y[i] in classes:
                    new_x.append(x[i])
                    new_y.append(classes.index(y[i]))
        else:
            for i in range(len(x)):
                if y[i] in classes:
                    new_x.append(x[i])
                    new_y.append(y[i])

        # sample to data_num randomly
        if 0 < data_num < len(new_x):
            random_index = list(range(len(new_x)))
            random.seed(seed)
            random.shuffle(random_index)
            random_index = random_index[:data_num]
            filter_x = []
            filter_y = []
            for index in random_index:
                filter_x.append(new_x[index])
                filter_y.append(new_y[index])
            return filter_x, filter_y
        return new_x, new_y


class VisionDataset(Dataset):
    r""" ``VisionDataset`` is the base class of all image datasets. By inheriting ``VisionDataset``, users can easily generate their own quantum data.

    Args:
        figure_size: The size of the figure.
    """

    def __init__(self, figure_size: int):
        Dataset.__init__(self)
        self.figure_size = figure_size
        return

    # The encode function only needs one-dimensional image vectors as features.
    # The pre-processing of images (except dimensionality reduction) is completed before the features are imported.
    def encode(self, feature: Union[list, np.ndarray], encoding: str, num_qubits: int, split_circuit: Optional[bool] = False,
               downscaling_method: Optional[str] = DOWNSCALINGMETHOD_RESIZE, target_dimension: Optional[int] = -1,
               return_state: Optional[bool] = True, full_return: Optional[bool] = False) -> Tuple[paddle.Tensor, list, np.ndarray, np.ndarray]:
        r"""Encode ``feature`` into ``num_qubits`` qubits using ``encoding`` after downscaling to ``target_dimension``. ``feature`` consists of one-dimensional image vectors.

        Args:
            feature: One-dimensional image vectors, given as a list or ndarray.
            encoding: ``angle_encoding`` denotes angle encoding, where one qubit encodes one number with an Ry gate; ``amplitude_encoding`` denotes amplitude encoding;
                      ``pauli_rotation_encoding`` denotes using SU(3) rotation gates; ``linear_entangled_encoding``, ``real_entangled_encoding``, ``complex_entangled_encoding``
                      and ``IQP_encoding`` are also supported.
            num_qubits: Qubit number.
            split_circuit: Whether to split the circuits. If ``True``, every layer of the encoding circuit will be split into a list. Defaults to ``False``.
            downscaling_method: Either ``PCA`` or ``resize``. Defaults to ``resize``.
            target_dimension: The dimension after downscaling, which must not exceed the figure size. Defaults to ``-1``.
            return_state: Whether to return quantum states. If it is ``False``, return quantum circuits. Defaults to ``True``.
            full_return: Whether to return ``quantum_image_states``, ``quantum_image_circuits``, ``original_images`` and ``classical_image_vectors``. Defaults to ``False``.

        Raises:
            Exception: PCA dimension should be less than figure size.
            Exception: Resize dimension should be a square.
            Exception: Downscaling methods can only be resize and PCA.
            Exception: Invalid encoding methods

        Returns:
            A tuple containing the following elements:

            - quantum_image_states: Quantum states; returned only if ``full_return`` or ``return_state`` is ``True``.
            - quantum_image_circuits: A list of circuits generating the quantum states; returned only if ``full_return`` is ``True`` or ``return_state`` is ``False``.
            - original_images: One-dimensional original vectors without any processing; returned only if ``full_return`` is ``True``.
            - classical_image_vectors: One-dimensional vectors after zero-padding, which are encoded into quantum states; returned only if ``full_return`` is ``True``.
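
        A minimal usage sketch (hedged: ``images`` stands for a caller-supplied list of flattened 28 x 28 grayscale images, a placeholder name)::

            dataset = VisionDataset(figure_size=28)
            # images: caller-supplied flattened 28 x 28 grayscale vectors (placeholder)
            states = dataset.encode(
                images, encoding="amplitude_encoding", num_qubits=4,
                downscaling_method="resize", target_dimension=16,
            )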
        """
        assert num_qubits > 0
        if encoding in [IQP_ENCODING, COMPLEX_ENTANGLED_ENCODING, REAL_ENTANGLED_ENCODING,
                        LINEAR_ENTANGLED_ENCODING]:
            assert num_qubits > 1

        if type(feature) == np.ndarray:
            feature = list(feature)

        # The first step: judge whether `target_dimension` is reasonable
        if target_dimension > -1:
            if downscaling_method == DOWNSCALINGMETHOD_PCA:
                if target_dimension > self.figure_size:
                    raise Exception("PCA dimension should be less than {}.".format(self.figure_size))
            elif downscaling_method == DOWNSCALINGMETHOD_RESIZE:
                if int(np.sqrt(target_dimension)) ** 2 != target_dimension:  # not a square
                    raise Exception("Resize dimension should be a square.")
            else:
                raise Exception("Downscaling methods can only be resize and PCA.")
        else:
            if downscaling_method == DOWNSCALINGMETHOD_PCA:
                target_dimension = self.figure_size
            elif downscaling_method == DOWNSCALINGMETHOD_RESIZE:
                target_dimension = self.figure_size ** 2

        # The second step: calculate `can_describe_dimension`
        if encoding == AMPLITUDE_ENCODING:  # amplitude encoding, encoding 2^N-dimension feature
            self.can_describe_dimension = 2 ** num_qubits

        elif encoding == LINEAR_ENTANGLED_ENCODING:
            one_block_param = 2 * num_qubits
            self.can_describe_dimension = math.ceil(target_dimension / one_block_param) * one_block_param

        elif encoding in [REAL_ENTANGLED_ENCODING, ANGLE_ENCODING, IQP_ENCODING]:
            one_block_param = 1 * num_qubits
            self.can_describe_dimension = math.ceil(target_dimension / one_block_param) * one_block_param

        elif encoding in [COMPLEX_ENTANGLED_ENCODING, PAULI_ROTATION_ENCODING]:
            one_block_param = 3 * num_qubits
            self.can_describe_dimension = math.ceil(target_dimension / one_block_param) * one_block_param

        else:
            raise Exception("Invalid encoding methods!")
        self.dimension = target_dimension

        # The third step: downscale the images and crop or pad each vector to ``can_describe_dimension``
        self.original_images = np.array(feature)
        self.classical_image_vectors = feature.copy()

        # Note that ``resize`` requires uint8 data, while image data from paddle is float32, so we convert the type.
        if downscaling_method == DOWNSCALINGMETHOD_RESIZE:
            # iterating all items
            for i in range(len(self.classical_image_vectors)):
                cur_image = self.classical_image_vectors[i].astype(np.uint8)
                new_size = int(np.sqrt(self.dimension))
                cur_image = transform.resize(cur_image.reshape((self.figure_size, self.figure_size)),
                                             (new_size, new_size))
                self.classical_image_vectors[i] = cur_image.reshape(-1).astype(np.float64)  # now it is one-dimension

                if self.can_describe_dimension < len(self.classical_image_vectors[i]):
                    self.classical_image_vectors[i] = self.classical_image_vectors[i][:self.can_describe_dimension]
                else:
                    self.classical_image_vectors[i] = np.append(
                        self.classical_image_vectors[i],
                        np.array([0.0] * (self.can_describe_dimension - len(self.classical_image_vectors[i])))
                    )

        elif downscaling_method == DOWNSCALINGMETHOD_PCA:
            for i in range(len(self.classical_image_vectors)):
                _, s, _ = np.linalg.svd(self.classical_image_vectors[i].reshape((self.figure_size, self.figure_size)))
                s = s[:self.dimension].astype(np.float64)
                if self.can_describe_dimension > self.dimension:
                    self.classical_image_vectors[i] = np.append(s, np.array(
                        [0.0] * (self.can_describe_dimension - self.dimension)))
                else:
                    self.classical_image_vectors[i] = s[:self.can_describe_dimension]

        # Step 4: Encode the data, which must be of float64 type (required by Paddle Quantum)
        self.quantum_image_states, self.quantum_image_circuits = self.data2circuit(
            self.classical_image_vectors, encoding, num_qubits, self.can_describe_dimension, split_circuit,
            return_state, is_image=True)
        self.classical_image_vectors = np.array(self.classical_image_vectors)
        if return_state:
            self.quantum_image_states = paddle.to_tensor(np.array(self.quantum_image_states))  # transfer to tensor

        if full_return:
            return (
                self.quantum_image_states, self.quantum_image_circuits,
                self.original_images, self.classical_image_vectors
            )
        if return_state:
            return self.quantum_image_states
        return self.quantum_image_circuits


class MNIST(VisionDataset):
    r"""MNIST quantum dataset. It inherits ``VisionDataset``.

    Args:
        mode: Data mode including ``train`` and ``test``.
        encoding: ``angle_encoding`` denotes angle encoding, where one qubit encodes one number with an Ry gate; ``amplitude_encoding`` denotes amplitude encoding;
                    ``pauli_rotation_encoding`` denotes using SU(3) rotation gates; ``linear_entangled_encoding``, ``real_entangled_encoding``, ``complex_entangled_encoding``
                    and ``IQP_encoding`` are also supported.
        num_qubits: Qubit number.
        classes: Classes needed to classify, indicated by numeric labels.
        data_num: Number of data returned. Defaults to ``-1``.
        split_circuit: Whether to split the circuits. If ``True``, every layer of the encoding circuit will be split into a list. Defaults to ``False``.
        downscaling_method: Either ``PCA`` or ``resize``. Defaults to ``resize``.
        target_dimension: The dimension after downscaling, which must not exceed the figure size. Defaults to ``-1``.
        need_cropping: Whether cropping is needed. If ``True``, ``image[0:27][0:27]`` will be cropped to ``image[4:24][4:24]``. Defaults to ``True``.
        need_relabel: Whether to relabel the labels to 0, 1, 2, ... for binary classification. For example, ``[1, 2]`` will be relabeled to ``[0, 1]``. Defaults to ``True``.
        return_state: Whether to return quantum states. Defaults to ``True``.
        seed: Select random seed. Defaults to ``0``.

    Raises:
        Exception: Data mode can only be train and test.
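
    A minimal usage sketch (hedged: assumes the MNIST data can be downloaded via ``paddle.vision.datasets`` and that 4 qubits suffice for ``target_dimension=16``)::

        train_set = MNIST(
            mode="train", encoding="amplitude_encoding", num_qubits=4,
            classes=[0, 1], data_num=100,
            downscaling_method="resize", target_dimension=16,
        )
        states = train_set.quantum_image_states  # paddle.Tensor of encoded states
        labels = train_set.labels                # numpy array of relabeled targets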

    """

    def __init__(
            self, mode: str, encoding: str, num_qubits: int, classes: list, data_num: Optional[int] = -1,
            split_circuit: Optional[bool] = False, downscaling_method: Optional[str] = DOWNSCALINGMETHOD_RESIZE,
            target_dimension: Optional[int] = -1, need_cropping: Optional[bool] = True,
            need_relabel: Optional[bool] = True, return_state: Optional[bool] = True, seed: Optional[int] = 0
    ) -> None:
        VisionDataset.__init__(self, 28)

        if need_cropping:
            self.figure_size = 20

        # Download data from paddlepaddle
        if mode == DATAMODE_TRAIN:
            train_dataset = paddle.vision.datasets.MNIST(mode='train')
            feature, self.labels = self.filter_class(
                train_dataset.images, train_dataset.labels,
                classes=classes, data_num=data_num, need_relabel=need_relabel, seed=seed
            )
            if need_cropping:
                feature = _crop(feature, [4, 24])

        elif mode == DATAMODE_TEST:
            test_dataset = paddle.vision.datasets.MNIST(mode='test')
            # test_dataset.images is a list of vectors with shape (784, 1)
            feature, self.labels = self.filter_class(
                test_dataset.images, test_dataset.labels,
                classes=classes, data_num=data_num, need_relabel=need_relabel, seed=seed
            )
            if need_cropping:
                feature = _crop(feature, [4, 24])

        else:
            raise Exception("data mode can only be train and test.")

        # Start to encode
        self.quantum_image_states, self.quantum_image_circuits, self.original_images, self.classical_image_vectors = \
            self.encode(
                feature, encoding, num_qubits, split_circuit,
                downscaling_method, target_dimension, return_state, True
            )
        self.labels = np.array(self.labels)

    def __len__(self):
        return len(self.quantum_image_states)


class FashionMNIST(VisionDataset):
    r"""FashionMNIST quantum dataset. It inherits ``VisionDataset``.

    Args:
        mode: Data mode including ``train`` and ``test``.
        encoding: ``angle_encoding`` denotes angle encoding, where one qubit encodes one number with an Ry gate; ``amplitude_encoding`` denotes amplitude encoding;
                    ``pauli_rotation_encoding`` denotes using SU(3) rotation gates; ``linear_entangled_encoding``, ``real_entangled_encoding``, ``complex_entangled_encoding``
                    and ``IQP_encoding`` are also supported.
        num_qubits: Qubit number.
        classes: Classes needed to classify, indicated by numeric labels.
        data_num: Number of data returned. Defaults to ``-1``.
        split_circuit: Whether to split the circuits. If ``True``, every layer of the encoding circuit will be split into a list. Defaults to ``False``.
        downscaling_method: Either ``PCA`` or ``resize``. Defaults to ``resize``.
        target_dimension: The dimension after downscaling, which must not exceed the figure size. Defaults to ``-1``.
        need_relabel: Whether to relabel the labels to 0, 1, 2, ... for binary classification. For example, ``[1, 2]`` will be relabeled to ``[0, 1]``. Defaults to ``True``.
        return_state: Whether to return quantum states. Defaults to ``True``.
        seed: Select random seed. Defaults to ``0``.

    Raises:
        Exception: Data mode can only be train and test.
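
    A minimal usage sketch (hedged: assumes the FashionMNIST data can be downloaded via ``paddle.vision.datasets``)::

        test_set = FashionMNIST(
            mode="test", encoding="pauli_rotation_encoding", num_qubits=4,
            classes=[0, 1], data_num=50, target_dimension=16,
        )
        states, labels = test_set.quantum_image_states, test_set.labels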
    """

    def __init__(
            self, mode: str, encoding: str, num_qubits: int, classes: list, data_num: Optional[int] = -1,
            split_circuit: Optional[bool] = False, downscaling_method: Optional[str] = DOWNSCALINGMETHOD_RESIZE,
            target_dimension: Optional[int] = -1, need_relabel: Optional[bool] = True,
            return_state: Optional[bool] = True, seed: Optional[int] = 0) -> None:
        r"""Constructor.
        """
        VisionDataset.__init__(self, 28)

        # Download data from paddlepaddle
        if mode == DATAMODE_TRAIN:
            train_dataset = paddle.vision.datasets.FashionMNIST(mode='train')
            feature, self.labels = self.filter_class(
                train_dataset.images, train_dataset.labels,
                classes=classes, data_num=data_num, need_relabel=need_relabel, seed=seed
            )

        elif mode == DATAMODE_TEST:
            test_dataset = paddle.vision.datasets.FashionMNIST(mode='test')
            # test_dataset.images is a list of vectors with shape (784, 1)
            feature, self.labels = self.filter_class(
                test_dataset.images, test_dataset.labels,
                classes=classes, data_num=data_num, need_relabel=need_relabel, seed=seed
            )

        else:
            raise Exception("data mode can only be train and test.")

        # Start to encode
        self.quantum_image_states, self.quantum_image_circuits, self.original_images, self.classical_image_vectors = \
            self.encode(
                feature, encoding, num_qubits, split_circuit, downscaling_method, target_dimension,
                return_state, True
            )
        self.labels = np.array(self.labels)

    def __len__(self):
        return len(self.quantum_image_states)


class SimpleDataset(Dataset):
    r"""For simple dataset that does not require dimension reduction. You can inherit ``SimpleDataset`` to generate quantum states from your classical datasets.

    Args:
        dimension: Dimension of encoding data.
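
    A minimal usage sketch (hedged: the random features below are illustrative placeholders, not a real dataset)::

        dataset = SimpleDataset(dimension=4)
        feature = np.random.rand(10, 4)   # 10 samples, 4 features each (placeholder data)
        states = dataset.encode(feature, "angle_encoding", num_qubits=4)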

    """

    def __init__(self, dimension: int):
        Dataset.__init__(self)
        self.dimension = dimension
        return

    def encode(self, feature: Union[list, np.ndarray], encoding: str, num_qubits: int, 
               return_state: Optional[bool] = True, full_return: Optional[bool] = False) -> Tuple[np.ndarray, list, np.ndarray, np.ndarray]:
        r"""Encode ``feature`` with ``num_qubits`` qubits by ``encoding``.

        Args:
            feature: Features needed to encode.
            encoding: Encoding methods.
            num_qubits: Qubit number.
            return_state: Whether to return quantum states. Defaults to ``True``.
            full_return: Whether to return quantum_states, quantum_circuits, origin_feature and feature. Defaults to ``False``.

        Raises:
            Exception: Invalid type of feature.
            Exception: Invalid encoding methods.
            Exception: The qubit number is not enough to encode the features.

        Returns:
            A tuple containing the following elements:

            - quantum_states: Quantum states; returned only if ``full_return`` or ``return_state`` is ``True``.
            - quantum_circuits: A list of circuits generating the quantum states; returned only if ``full_return`` is ``True`` or ``return_state`` is ``False``.
            - origin_feature: One-dimensional original vectors without any processing; returned only if ``full_return`` is ``True``.
            - feature: One-dimensional vectors after zero-padding, which are encoded into quantum states; returned only if ``full_return`` is ``True``.
        """
        assert num_qubits > 0
        encoding_list = [
            IQP_ENCODING, COMPLEX_ENTANGLED_ENCODING,
            REAL_ENTANGLED_ENCODING, LINEAR_ENTANGLED_ENCODING
        ]
        if encoding in encoding_list:
            assert num_qubits > 1

        if isinstance(feature, np.ndarray):
            self.feature = list(feature)
        elif isinstance(feature, list):
            self.feature = feature
        else:
            raise Exception("invalid type of feature")

        self.origin_feature = np.array(feature)

        # The first step: calculate ``self.can_describe_dimension`` and check whether the qubit number is sufficient
        if encoding == AMPLITUDE_ENCODING:  # amplitude encoding, encoding 2^N-dimension feature
            self.can_describe_dimension = 2 ** num_qubits
        # For these three kinds of entanglement encoding: lay these parameters block by block.
        elif encoding == LINEAR_ENTANGLED_ENCODING:
            one_block_param = 2 * num_qubits
            self.can_describe_dimension = math.ceil(self.dimension / one_block_param) * one_block_param

        elif encoding in [REAL_ENTANGLED_ENCODING, IQP_ENCODING, ANGLE_ENCODING]:
            one_block_param = 1 * num_qubits
            self.can_describe_dimension = math.ceil(self.dimension / one_block_param) * one_block_param

        elif encoding in [COMPLEX_ENTANGLED_ENCODING, PAULI_ROTATION_ENCODING]:
            one_block_param = 3 * num_qubits
            self.can_describe_dimension = math.ceil(self.dimension / one_block_param) * one_block_param

        else:
            raise Exception("Invalid encoding methods!")

        if self.can_describe_dimension < self.dimension:
            raise Exception("The qubit number is not enough to encode the features.")

        # The second step: fill the vector to ``can_describe_dimension`` using zero
        for i in range(len(self.feature)):
            self.feature[i] = self.feature[i].reshape(-1).astype(
                np.float64)  # now self.feature[i] is a one-dimensional numpy array
            self.feature[i] = np.append(
                self.feature[i],
                np.array([0.0] * (self.can_describe_dimension - self.dimension))
            )  # now self.feature[i] is padded to length ``self.can_describe_dimension``

        # Step 3: Encode the data, which must be of float64 type (required by Paddle Quantum)
        self.quantum_states, self.quantum_circuits = self.data2circuit(
            self.feature, encoding, num_qubits, self.can_describe_dimension, False,  # split_circuit=False
            return_state
        )

        self.feature = np.array(self.feature)
        self.quantum_states = np.array(self.quantum_states)

        if full_return:
            return self.quantum_states, self.quantum_circuits, self.origin_feature, self.feature
        if return_state:
            return self.quantum_states
        return self.quantum_circuits


class Iris(SimpleDataset):
    r"""Iris dataset

    Args:
        encoding: ``angle_encoding`` denotes angle encoding, where one qubit encodes one number with an Ry gate; ``amplitude_encoding`` denotes amplitude encoding;
                    ``pauli_rotation_encoding`` denotes using SU(3) rotation gates; ``linear_entangled_encoding``, ``real_entangled_encoding``, ``complex_entangled_encoding``
                    and ``IQP_encoding`` are also supported.
        num_qubits: Qubit number.
        classes: Classes needed to classify, indicated by numeric labels.
        test_rate: The proportion of the testing dataset. Defaults to ``0.2``.
        need_relabel: Whether to relabel the labels to 0, 1, 2, ... for binary classification. For example, ``[1, 2]`` will be relabeled to ``[0, 1]``. Defaults to ``True``.
        return_state: Whether to return quantum states. Defaults to ``True``.
        seed: Select random seed. Defaults to ``0``.
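
    A minimal usage sketch (hedged: assumes scikit-learn is available to load the Iris data)::

        iris = Iris(encoding="angle_encoding", num_qubits=2, classes=[0, 1])
        train_x, train_y = iris.train_x, iris.train_y   # encoded training states and labels
        test_x, test_y = iris.test_x, iris.test_y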
    
    """

    def __init__(self, encoding: str, num_qubits: int, classes: list, test_rate: Optional[float] = 0.2, 
                 need_relabel: Optional[bool] = True, return_state: Optional[bool] = True, seed: Optional[int] = 0) -> None:
        SimpleDataset.__init__(self, dimension=4)

        # Download data from scikit-learn
        iris = datasets.load_iris()
        self.dimension = 4  # dimension of Iris dataset
        feature, self.target = self.filter_class(
            iris.data, iris.target, classes, -1, need_relabel
        )  # here -1 means all data
        self.target = np.array(self.target)

        # Start to encode
        self.quantum_states, self.quantum_circuits, self.origin_feature, self.feature = \
            self.encode(feature, encoding, num_qubits, return_state, True)

        # Divide training and testing dataset
        self.train_x, self.test_x, self.train_y, self.test_y = \
            train_test_split(self.quantum_states, self.target, test_size=test_rate,
                             random_state=seed)

        self.train_circuits, self.test_circuits, temp1, temp2 = \
            train_test_split(self.quantum_circuits, self.target, test_size=test_rate,
                             random_state=seed)

        self.origin_train_x, self.origin_test_x, temp1, temp2 = \
            train_test_split(self.origin_feature, self.target, test_size=test_rate,
                             random_state=seed)
        if return_state:
            self.train_x = paddle.to_tensor(self.train_x)
            self.test_x = paddle.to_tensor(self.test_x)


class BreastCancer(SimpleDataset):
    r"""BreastCancer quantum dataset.

    Args:
        encoding: ``angle_encoding`` denotes angle encoding, where one qubit encodes one number with an Ry gate; ``amplitude_encoding`` denotes amplitude encoding;
                    ``pauli_rotation_encoding`` denotes using SU(3) rotation gates; ``linear_entangled_encoding``, ``real_entangled_encoding``, ``complex_entangled_encoding``
                    and ``IQP_encoding`` are also supported.
        num_qubits: Qubit number.
        test_rate: The proportion of the testing dataset. Defaults to ``0.2``.
        return_state: Whether to return quantum states. Defaults to ``True``.
        seed: Select random seed. Defaults to ``0``.
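
    A minimal usage sketch (hedged: assumes scikit-learn is available and that 5 qubits, i.e. 2 ** 5 = 32 amplitudes, can hold the 30 features)::

        bc = BreastCancer(encoding="amplitude_encoding", num_qubits=5)
        train_x, train_y = bc.train_x, bc.train_y
        test_x, test_y = bc.test_x, bc.test_y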

    """

    def __init__(self, encoding: str, num_qubits: int, test_rate: Optional[float] = 0.2,
                 return_state: Optional[bool] = True, seed: Optional[int] = 0) -> None:
Q
        self.dimension = 30

        # Download data from scikit-learn
        breast_cancer = datasets.load_breast_cancer()
        feature = breast_cancer["data"]
        self.target = breast_cancer["target"]

        self.target = np.array(self.target)

        # Start to encode
        self.quantum_states, self.quantum_circuits, self.origin_feature, self.feature = \
            self.encode(feature, encoding, num_qubits, return_state, True)

        # Divide training and testing dataset
        self.train_x, self.test_x, self.train_y, self.test_y = \
            train_test_split(self.quantum_states, self.target, test_size=test_rate, random_state=seed)

        self.train_circuits, self.test_circuits, temp1, temp2 = \
            train_test_split(self.quantum_circuits, self.target, test_size=test_rate, random_state=seed)

        self.origin_train_x, self.origin_test_x, temp1, temp2 = \
            train_test_split(self.origin_feature, self.target, test_size=test_rate, random_state=seed)
        if return_state:
            self.train_x = paddle.to_tensor(self.train_x)
            self.test_x = paddle.to_tensor(self.test_x)