diff --git a/build/docker/Dockerfile b/build/docker/Dockerfile
index 7a1cfff59fc624e97953f4ab52af1afa9c1e1f6b..2e624eab9b046d809d5c2560ffd05cd119940294 100644
--- a/build/docker/Dockerfile
+++ b/build/docker/Dockerfile
@@ -50,6 +50,13 @@ RUN gpg2 --keyserver hkp://keys.gnupg.net --recv-keys 409B6B1796C275462A17031138
 ENV PATH "$PATH:/usr/local/rvm/bin"
 RUN rvm install 2.4.1
 
+# Install nodejs and yarn
+RUN curl -sL https://deb.nodesource.com/setup_10.x | bash -
+RUN apt-get install -y nodejs
+RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add -
+RUN echo "deb https://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list
+RUN apt-get update && apt-get install -y yarn
+
 # Install PIP and PDoc
 RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py
 RUN pip install pdoc
diff --git a/pom.xml b/pom.xml
index 6369a10c6856f364c0f902708f485add9a2dcdac..ccc50cd29809756492c37e41e3ea84af195e3242 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1005,6 +1005,7 @@ flexible messaging model and an intuitive client API.
             <exclude>site/_sass/font-awesome/**</exclude>
             <exclude>site/fonts/**</exclude>
             <exclude>site/img/**</exclude>
+            <exclude>site2/**</exclude>
             <exclude>generated-site/**</exclude>
             <exclude>.github/*.md</exclude>
             <exclude>**/.idea/*</exclude>
diff --git a/site2/.gitignore b/site2/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..5395ea795d62b04c6645f5746b9cc667d26a07bb
--- /dev/null
+++ b/site2/.gitignore
@@ -0,0 +1,12 @@
+.DS_Store
+
+node_modules
+
+lib/core/metadata.js
+lib/core/MetadataBlog.js
+
+website/translated_docs
+website/build/
+website/yarn.lock
+website/node_modules
+website/i18n/*
diff --git a/site2/README.md b/site2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3333f53d6c571fb97ca994ad28d63bb9593a0add
--- /dev/null
+++ b/site2/README.md
@@ -0,0 +1,24 @@
+
+
+# The Pulsar website and documentation
+
+This `README` is the meta-documentation for the Pulsar website and documentation. Here you'll find instructions on running the site locally.
+
+## Tools
+
+The site is built with the [Docusaurus](https://docusaurus.io/) framework.
+
+Ensure you have the latest version of [Node](https://nodejs.org/en/download/) installed. We also recommend installing [Yarn](https://yarnpkg.com/en/docs/install).
+
+> You have to be on Node >= 8.x and Yarn >= 1.5.
+
+
+## Running the site locally
+
+To run the site locally:
+
+```bash
+cd website
+yarn install
+yarn start
+```
diff --git a/site2/docs/adaptors-kafka.md b/site2/docs/adaptors-kafka.md
new file mode 100644
index 0000000000000000000000000000000000000000..a3f309db53a8390828af899557f3927fc2b3bcde
--- /dev/null
+++ b/site2/docs/adaptors-kafka.md
@@ -0,0 +1,259 @@
+---
+id: adaptors-kafka
+title: Pulsar adaptor for Apache Kafka
+sidebar_label: Kafka client wrapper
+---
+
+
+Pulsar provides an easy option for applications that are currently written using the [Apache Kafka](http://kafka.apache.org) Java client API.
+
+## Using the Pulsar Kafka compatibility wrapper
+
+In an existing application, replace the regular Kafka client dependency with the Pulsar Kafka wrapper. First, remove the Kafka client dependency:
+
+```xml
+<dependency>
+  <groupId>org.apache.kafka</groupId>
+  <artifactId>kafka-clients</artifactId>
+  <version>0.10.2.1</version>
+</dependency>
+```
+
+Then include this dependency for the Pulsar Kafka wrapper:
+
+```xml
+<dependency>
+  <groupId>org.apache.pulsar</groupId>
+  <artifactId>pulsar-client-kafka</artifactId>
+  <version>pulsar:version</version>
+</dependency>
+```
+
+With the new dependency, the existing code should work without any changes. The only thing you need to adjust is the configuration: make sure the producers and consumers point to a Pulsar service rather than a Kafka cluster, and use a Pulsar topic.
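+
+For instance, a minimal sketch of that configuration change might look like the following (the service URL and topic name are placeholders for a local standalone deployment; complete producer and consumer examples appear below):
+
+```java
+Properties props = new Properties();
+// Point the Kafka client at a Pulsar service URL instead of a Kafka broker list
+props.put("bootstrap.servers", "pulsar://localhost:6650");
+props.put("key.serializer", IntegerSerializer.class.getName());
+props.put("value.serializer", StringSerializer.class.getName());
+
+// Use a regular Pulsar topic name rather than a Kafka topic name
+String topic = "persistent://public/default/my-topic";
+```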
+
+## Using the Pulsar Kafka compatibility wrapper together with an existing Kafka client
+
+When migrating from Kafka to Pulsar, the application might need to use the original Kafka client and the Pulsar Kafka wrapper together during the migration. In that case, you should use the unshaded Pulsar Kafka client wrapper:
+
+```xml
+<dependency>
+  <groupId>org.apache.pulsar</groupId>
+  <artifactId>pulsar-client-kafka-original</artifactId>
+  <version>pulsar:version</version>
+</dependency>
+```
+
+When using this dependency, construct producers using `org.apache.kafka.clients.producer.PulsarKafkaProducer` instead of `org.apache.kafka.clients.producer.KafkaProducer`, and consumers using `org.apache.kafka.clients.consumer.PulsarKafkaConsumer` instead of `org.apache.kafka.clients.consumer.KafkaConsumer`.
+
+## Producer example
+
+```java
+// Topic needs to be a regular Pulsar topic
+String topic = "persistent://public/default/my-topic";
+
+Properties props = new Properties();
+// Point to a Pulsar service
+props.put("bootstrap.servers", "pulsar://localhost:6650");
+
+props.put("key.serializer", IntegerSerializer.class.getName());
+props.put("value.serializer", StringSerializer.class.getName());
+
+Producer<Integer, String> producer = new KafkaProducer<>(props);
+
+for (int i = 0; i < 10; i++) {
+    producer.send(new ProducerRecord<Integer, String>(topic, i, "hello-" + i));
+    log.info("Message {} sent successfully", i);
+}
+
+producer.close();
+```
+
+## Consumer example
+
+```java
+String topic = "persistent://public/default/my-topic";
+
+Properties props = new Properties();
+// Point to a Pulsar service
+props.put("bootstrap.servers", "pulsar://localhost:6650");
+props.put("group.id", "my-subscription-name");
+props.put("enable.auto.commit", "false");
+props.put("key.deserializer", IntegerDeserializer.class.getName());
+props.put("value.deserializer", StringDeserializer.class.getName());
+
+Consumer<Integer, String> consumer = new KafkaConsumer<>(props);
+consumer.subscribe(Arrays.asList(topic));
+
+while (true) {
+    ConsumerRecords<Integer, String> records = consumer.poll(100);
+    records.forEach(record -> {
+        log.info("Received record: {}", record);
+    });
+
+    // Commit last offset
+    consumer.commitSync();
+}
+```
+
+## Complete Examples
+
+You can find the complete producer and consumer examples
+[here](https://github.com/apache/incubator-pulsar/tree/master/pulsar-client-kafka-compat/pulsar-client-kafka-tests/src/test/java/org/apache/pulsar/client/kafka/compat/examples).
+
+## Compatibility matrix
+
+Currently, the Pulsar Kafka wrapper supports most of the operations offered by the Kafka API.
+
+#### Producer
+
+APIs:
+
+| Producer Method | Supported | Notes |
+|:----------------|:----------|:------|
+| `Future<RecordMetadata> send(ProducerRecord<K, V> record)` | Yes | Currently there is no support for explicitly setting the partition ID when publishing |
+| `Future<RecordMetadata> send(ProducerRecord<K, V> record, Callback callback)` | Yes | |
+| `void flush()` | Yes | |
+| `List<PartitionInfo> partitionsFor(String topic)` | No | |
+| `Map<MetricName, ? extends Metric> metrics()` | No | |
+| `void close()` | Yes | |
+| `void close(long timeout, TimeUnit unit)` | Yes | |
+
+Properties:
+
+| Config property | Supported | Notes |
+|:----------------|:----------|:------|
+| `acks` | Ignored | Durability and quorum writes are configured at the namespace level |
+| `batch.size` | Ignored | |
+| `block.on.buffer.full` | Yes | If true it blocks the producer; otherwise it throws an error |
+| `bootstrap.servers` | Yes | Needs to point to a single Pulsar service URL |
+| `buffer.memory` | Ignored | |
+| `client.id` | Ignored | |
+| `compression.type` | Yes | Allows `gzip` and `lz4`. No `snappy`. |
+| `connections.max.idle.ms` | Ignored | |
+| `interceptor.classes` | Ignored | |
+| `key.serializer` | Yes | |
+| `linger.ms` | Yes | Controls the group commit time when batching messages |
+| `max.block.ms` | Ignored | |
+| `max.in.flight.requests.per.connection` | Ignored | In Pulsar, ordering is maintained even with multiple requests in flight |
+| `max.request.size` | Ignored | |
+| `metric.reporters` | Ignored | |
+| `metrics.num.samples` | Ignored | |
+| `metrics.sample.window.ms` | Ignored | |
+| `partitioner.class` | Ignored | |
+| `receive.buffer.bytes` | Ignored | |
+| `reconnect.backoff.ms` | Ignored | |
+| `request.timeout.ms` | Ignored | |
+| `retries` | Ignored | The Pulsar client retries with exponential backoff until the send timeout expires |
+| `send.buffer.bytes` | Ignored | |
+| `timeout.ms` | Ignored | |
+| `value.serializer` | Yes | |
+
+
+#### Consumer
+
+APIs:
+
+| Consumer Method | Supported | Notes |
+|:----------------|:----------|:------|
+| `Set<TopicPartition> assignment()` | No | |
+| `Set<String> subscription()` | Yes | |
+| `void subscribe(Collection<String> topics)` | Yes | |
+| `void subscribe(Collection<String> topics, ConsumerRebalanceListener callback)` | No | |
+| `void assign(Collection<TopicPartition> partitions)` | No | |
+| `void subscribe(Pattern pattern, ConsumerRebalanceListener callback)` | No | |
+| `void unsubscribe()` | Yes | |
+| `ConsumerRecords<K, V> poll(long timeoutMillis)` | Yes | |
+| `void commitSync()` | Yes | |
+| `void commitSync(Map<TopicPartition, OffsetAndMetadata> offsets)` | Yes | |
+| `void commitAsync()` | Yes | |
+| `void commitAsync(OffsetCommitCallback callback)` | Yes | |
+| `void commitAsync(Map<TopicPartition, OffsetAndMetadata> offsets, OffsetCommitCallback callback)` | Yes | |
+| `void seek(TopicPartition partition, long offset)` | Yes | |
+| `void seekToBeginning(Collection<TopicPartition> partitions)` | Yes | |
+| `void seekToEnd(Collection<TopicPartition> partitions)` | Yes | |
+| `long position(TopicPartition partition)` | Yes | |
+| `OffsetAndMetadata committed(TopicPartition partition)` | Yes | |
+| `Map<MetricName, ? extends Metric> metrics()` | No | |
+| `List<PartitionInfo> partitionsFor(String topic)` | No | |
+| `Map<String, List<PartitionInfo>> listTopics()` | No | |
+| `Set<TopicPartition> paused()` | No | |
+| `void pause(Collection<TopicPartition> partitions)` | No | |
+| `void resume(Collection<TopicPartition> partitions)` | No | |
+| `Map<TopicPartition, OffsetAndTimestamp> offsetsForTimes(Map<TopicPartition, Long> timestampsToSearch)` | No | |
+| `Map<TopicPartition, Long> beginningOffsets(Collection<TopicPartition> partitions)` | No | |
+| `Map<TopicPartition, Long> endOffsets(Collection<TopicPartition> partitions)` | No | |
+| `void close()` | Yes | |
+| `void close(long timeout, TimeUnit unit)` | Yes | |
+| `void wakeup()` | No | |
+
+Properties:
+
+| Config property | Supported | Notes |
+|:----------------|:----------|:------|
+| `group.id` | Yes | Maps to a Pulsar subscription name |
+| `max.poll.records` | Ignored | |
+| `max.poll.interval.ms` | Ignored | Messages are "pushed" from the broker |
+| `session.timeout.ms` | Ignored | |
+| `heartbeat.interval.ms` | Ignored | |
+| `bootstrap.servers` | Yes | Needs to point to a single Pulsar service URL |
+| `enable.auto.commit` | Yes | |
+| `auto.commit.interval.ms` | Ignored | With auto-commit, acks are sent immediately to the broker |
+| `partition.assignment.strategy` | Ignored | |
+| `auto.offset.reset` | Ignored | |
+| `fetch.min.bytes` | Ignored | |
+| `fetch.max.bytes` | Ignored | |
+| `fetch.max.wait.ms` | Ignored | |
+| `metadata.max.age.ms` | Ignored | |
+| `max.partition.fetch.bytes` | Ignored | |
+| `send.buffer.bytes` | Ignored | |
+| `receive.buffer.bytes` | Ignored | |
+| `client.id` | Ignored | |
+
+
+## Custom Pulsar configurations
+
+You can configure the Pulsar authentication provider directly from the Kafka properties.
+
+### Pulsar client properties
+
+| Config property | Default | Notes |
+|:----------------|:--------|:------|
+| `pulsar.authentication.class` | | The authentication provider to configure, e.g. `org.apache.pulsar.client.impl.auth.AuthenticationTls` |
+| [`pulsar.use.tls`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setUseTls-boolean-) | `false` | Enable TLS transport encryption |
+| [`pulsar.tls.trust.certs.file.path`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setTlsTrustCertsFilePath-java.lang.String-) | | Path for the TLS trust certificate store |
+| [`pulsar.tls.allow.insecure.connection`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setTlsAllowInsecureConnection-boolean-) | `false` | Accept self-signed certificates from brokers |
+| [`pulsar.operation.timeout.ms`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setOperationTimeout-int-java.util.concurrent.TimeUnit-) | `30000` | General operations timeout |
+| [`pulsar.stats.interval.seconds`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setStatsInterval-long-java.util.concurrent.TimeUnit-) | `60` | Pulsar client lib stats printing interval |
+| [`pulsar.num.io.threads`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setIoThreads-int-) | `1` | Number of Netty IO threads to use |
+| [`pulsar.connections.per.broker`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setConnectionsPerBroker-int-) | `1` | Max number of connections to open to each broker |
+| [`pulsar.use.tcp.nodelay`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setUseTcpNoDelay-boolean-) | `true` | TCP no-delay |
+| [`pulsar.concurrent.lookup.requests`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setConcurrentLookupRequest-int-) | `50000` | Max number of concurrent topic lookups |
+| [`pulsar.max.number.rejected.request.per.connection`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setMaxNumberOfRejectedRequestPerConnection-int-) | `50` | Threshold of errors to forcefully close a connection |
+
+
+### Pulsar producer properties
+
+| Config property | Default | Notes |
+|:----------------|:--------|:------|
+| [`pulsar.producer.name`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setProducerName-java.lang.String-) | | Specify the producer name |
+| [`pulsar.producer.initial.sequence.id`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setInitialSequenceId-long-) | | Specify the baseline for the sequence IDs of this producer |
+| [`pulsar.producer.max.pending.messages`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setMaxPendingMessages-int-) | `1000` | Set the max size of the queue holding the messages pending to receive an acknowledgment from the broker |
+| [`pulsar.producer.max.pending.messages.across.partitions`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setMaxPendingMessagesAcrossPartitions-int-) | `50000` | Set the max number of pending messages across all the partitions |
+| [`pulsar.producer.batching.enabled`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBatchingEnabled-boolean-) | `true` | Control whether automatic batching of messages is enabled for the producer |
+| [`pulsar.producer.batching.max.messages`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBatchingMaxMessages-int-) | `1000` | The maximum number of messages permitted in a batch |
+
+
+### Pulsar consumer properties
+
+| Config property | Default | Notes |
+|:----------------|:--------|:------|
+| [`pulsar.consumer.name`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setConsumerName-java.lang.String-) | | Set the consumer name |
+| [`pulsar.consumer.receiver.queue.size`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setReceiverQueueSize-int-) | `1000` | Set the size of the consumer receive queue |
+| [`pulsar.consumer.total.receiver.queue.size.across.partitions`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setMaxTotalReceiverQueueSizeAcrossPartitions-int-) | `50000` | Set the max total receiver queue size across partitions |
+
diff --git a/site2/docs/adaptors-spark.md b/site2/docs/adaptors-spark.md
new file mode 100644
index 0000000000000000000000000000000000000000..1cecaa9dfe9177bfc8cc469733208113c66a205b
--- /dev/null
+++ b/site2/docs/adaptors-spark.md
@@ -0,0 +1,66 @@
+---
+id: adaptors-spark
+title: Pulsar adaptor for Apache Spark
+sidebar_label: Apache Spark
+---
+
+The Spark Streaming receiver for Pulsar is a custom receiver that enables Apache [Spark Streaming](https://spark.apache.org/streaming/) to receive data from Pulsar.
+
+An application can receive data in [Resilient Distributed Dataset](https://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds) (RDD) format via the Spark Streaming Pulsar receiver and can process it in a variety of ways.
+
+## Prerequisites
+
+To use the receiver, include a dependency for the `pulsar-spark` library in your Java configuration.
+
+### Maven
+
+If you're using Maven, add this to your `pom.xml`:
+
+```xml
+<!-- in your <properties> block -->
+<pulsar.version>pulsar:version</pulsar.version>
+
+<!-- in your <dependencies> block -->
+<dependency>
+  <groupId>org.apache.pulsar</groupId>
+  <artifactId>pulsar-spark</artifactId>
+  <version>${pulsar.version}</version>
+</dependency>
+```
+
+### Gradle
+
+If you're using Gradle, add this to your `build.gradle` file:
+
+```groovy
+def pulsarVersion = "pulsar:version"
+
+dependencies {
+    compile group: 'org.apache.pulsar', name: 'pulsar-spark', version: pulsarVersion
+}
+```
+
+## Usage
+
+Pass an instance of `SparkStreamingPulsarReceiver` to the `receiverStream` method in `JavaStreamingContext`:
+
+```java
+SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("pulsar-spark");
+JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
+
+ClientConfiguration clientConf = new ClientConfiguration();
+ConsumerConfiguration consConf = new ConsumerConfiguration();
+String url = "pulsar://localhost:6650/";
+String topic = "persistent://public/default/topic1";
+String subs = "sub1";
+
+JavaReceiverInputDStream<byte[]> msgs = jssc
+    .receiverStream(new SparkStreamingPulsarReceiver(clientConf, consConf, url, topic, subs));
+```
+
+
+## Example
+
+You can find a complete example [here](https://github.com/apache/incubator-pulsar/tree/master/pulsar-spark/src/test/java/org/apache/pulsar/spark/example/SparkStreamingPulsarReceiverExample.java).
+In this example, the number of received messages containing the string "Pulsar" is counted.
+
diff --git a/site2/docs/adaptors-storm.md b/site2/docs/adaptors-storm.md
new file mode 100644
index 0000000000000000000000000000000000000000..2c4e795c4d2afc67353bcc0d5404b97cf23b8721
--- /dev/null
+++ b/site2/docs/adaptors-storm.md
@@ -0,0 +1,104 @@
+---
+id: adaptors-storm
+title: Pulsar adaptor for Apache Storm
+sidebar_label: Apache Storm
+---
+
+Pulsar Storm is an adaptor for integrating with [Apache Storm](http://storm.apache.org/) topologies. It provides core Storm implementations for sending and receiving data.
+
+An application can inject data into a Storm topology via a generic Pulsar spout, as well as consume data from a Storm topology via a generic Pulsar bolt.
+
+## Using the Pulsar Storm Adaptor
+
+Include the dependency for the Pulsar Storm adaptor:
+
+```xml
+<dependency>
+  <groupId>org.apache.pulsar</groupId>
+  <artifactId>pulsar-storm</artifactId>
+  <version>${pulsar.version}</version>
+</dependency>
+```
+
+## Pulsar Spout
+
+The Pulsar spout allows data published on a topic to be consumed by a Storm topology. It emits a Storm tuple based on the message received and the `MessageToValuesMapper` provided by the client.
+
+Tuples that fail to be processed by the downstream bolts are re-injected by the spout with an exponential backoff, within a configurable timeout (the default is 60 seconds) or up to a configurable number of retries, whichever comes first, after which the message is acknowledged by the consumer.
+Here's an example construction of a spout:
+
+```java
+// Configure a Pulsar Client
+ClientConfiguration clientConf = new ClientConfiguration();
+
+// Configure a Pulsar Consumer
+ConsumerConfiguration consumerConf = new ConsumerConfiguration();
+
+@SuppressWarnings("serial")
+MessageToValuesMapper messageToValuesMapper = new MessageToValuesMapper() {
+
+    @Override
+    public Values toValues(Message msg) {
+        return new Values(new String(msg.getData()));
+    }
+
+    @Override
+    public void declareOutputFields(OutputFieldsDeclarer declarer) {
+        // declare the output fields
+        declarer.declare(new Fields("string"));
+    }
+};
+
+// Configure a Pulsar Spout
+PulsarSpoutConfiguration spoutConf = new PulsarSpoutConfiguration();
+spoutConf.setServiceUrl("pulsar://broker.messaging.usw.example.com:6650");
+spoutConf.setTopic("persistent://my-property/usw/my-ns/my-topic1");
+spoutConf.setSubscriptionName("my-subscriber-name1");
+spoutConf.setMessageToValuesMapper(messageToValuesMapper);
+
+// Create a Pulsar Spout
+PulsarSpout spout = new PulsarSpout(spoutConf, clientConf, consumerConf);
+```
+
+## Pulsar Bolt
+
+The Pulsar bolt allows data in a Storm topology to be published on a topic. It publishes messages based on the Storm tuple received and the `TupleToMessageMapper` provided by the client.
+
+A partitioned topic can also be used to publish messages on different topics. In the implementation of `TupleToMessageMapper`, a "key" needs to be provided in the message, which routes messages with the same key to the same partition. Here's an example bolt:
+
+```java
+// Configure a Pulsar Client
+ClientConfiguration clientConf = new ClientConfiguration();
+
+// Configure a Pulsar Producer
+ProducerConfiguration producerConf = new ProducerConfiguration();
+
+@SuppressWarnings("serial")
+TupleToMessageMapper tupleToMessageMapper = new TupleToMessageMapper() {
+
+    @Override
+    public Message toMessage(Tuple tuple) {
+        String receivedMessage = tuple.getString(0);
+        // message processing
+        String processedMsg = receivedMessage + "-processed";
+        return MessageBuilder.create().setContent(processedMsg.getBytes()).build();
+    }
+
+    @Override
+    public void declareOutputFields(OutputFieldsDeclarer declarer) {
+        // declare the output fields
+    }
+};
+
+// Configure a Pulsar Bolt
+PulsarBoltConfiguration boltConf = new PulsarBoltConfiguration();
+boltConf.setServiceUrl("pulsar://broker.messaging.usw.example.com:6650");
+boltConf.setTopic("persistent://my-property/usw/my-ns/my-topic2");
+boltConf.setTupleToMessageMapper(tupleToMessageMapper);
+
+// Create a Pulsar Bolt
+PulsarBolt bolt = new PulsarBolt(boltConf, clientConf);
+```
+
+## Example
+
+You can find a complete example [here](https://github.com/apache/incubator-pulsar/tree/master/pulsar-storm/src/test/java/org/apache/pulsar/storm/example/StormExample.java).
diff --git a/site2/docs/admin-api-brokers.md b/site2/docs/admin-api-brokers.md
new file mode 100644
index 0000000000000000000000000000000000000000..62199bfdcb7b37cd3e0745385992823edd387d8c
--- /dev/null
+++ b/site2/docs/admin-api-brokers.md
@@ -0,0 +1,158 @@
+---
+id: admin-api-brokers
+title: Managing Brokers
+sidebar_label: Brokers
+---
+
+Pulsar brokers consist of two components:
+
+1. An HTTP server that exposes a [REST interface](reference-rest-api.md) for administration and {% popover topic %} lookup.
+2. A dispatcher that handles all Pulsar {% popover message %} transfers.
+
+{% popover Brokers %} can be managed via:
+
+* The [`brokers`](reference-pulsar-admin.md#brokers) command of the [`pulsar-admin`](reference-pulsar-admin.md) tool
+* The `/admin/brokers` endpoint of the admin [REST API](reference-rest-api.md)
+* The `brokers` method of the {% javadoc PulsarAdmin admin org.apache.pulsar.client.admin.PulsarAdmin %} object in the [Java API](client-libraries-java.md)
+
+In addition to being configurable when you start them up, brokers can also be [dynamically configured](#dynamic-broker-configuration).
+
+{% include admonition.html type="info" content="
+See the [Configuration](reference-configuration.md#broker) page for a full listing of broker-specific configuration parameters.
+" %}
+
+## Brokers resources
+
+### List active brokers
+
+Fetch all available active brokers that are serving traffic.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin brokers list use
+```
+
+```
+broker1.use.org.com:8080
+```
+
+#### REST API
+
+{% endpoint GET /admin/brokers/:cluster %}
+
+[More info](reference-rest-api.md#/admin/brokers/:cluster)
+
+#### Java
+
+```java
+admin.brokers().getActiveBrokers(clusterName)
+```
+
+### List namespaces owned by a given broker
+
+It finds all namespaces that are owned and served by a given broker.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin brokers namespaces use \
+  --url broker1.use.org.com:8080
+```
+
+```json
+{
+  "my-property/use/my-ns/0x00000000_0xffffffff": {
+    "broker_assignment": "shared",
+    "is_controlled": false,
+    "is_active": true
+  }
+}
+```
+
+#### REST API
+
+{% endpoint GET /admin/brokers/:cluster/:broker/ownedNamespaces %}
+
+#### Java
+
+```java
+admin.brokers().getOwnedNamespaces(cluster, brokerUrl);
+```
+
+### Dynamic broker configuration
+
+One way to configure a Pulsar {% popover broker %} is to supply a [configuration](reference-configuration.md#broker) when the broker is [started up](reference-cli-tools.md#pulsar-broker).
+
+But since all broker configuration in Pulsar is stored in {% popover ZooKeeper %}, configuration values can also be dynamically updated *while the broker is running*. When you update broker configuration dynamically, ZooKeeper will notify the broker of the change and the broker will then override any existing configuration values.
+
+* The [`brokers`](reference-pulsar-admin.md#brokers) command for the [`pulsar-admin`](reference-pulsar-admin.md) tool has a variety of subcommands that enable you to manipulate a broker's configuration dynamically, enabling you to [update config values](#update-dynamic-configuration) and more.
+* In the Pulsar admin [REST API](reference-rest-api.md), dynamic configuration is managed through the `/admin/brokers/configuration` endpoint.
+
+### Update dynamic configuration
+
+#### pulsar-admin
+
+The [`update-dynamic-config`](reference-pulsar-admin.md#brokers-update-dynamic-config) subcommand will update existing configuration. It takes two arguments: the name of the parameter and the new value.
+Here's an example for the [`brokerShutdownTimeoutMs`](reference-configuration.md#broker-brokerShutdownTimeoutMs) parameter:
+
+```shell
+$ pulsar-admin brokers update-dynamic-config brokerShutdownTimeoutMs 100
+```
+
+#### REST API
+
+{% endpoint POST /admin/brokers/configuration/:configName/:configValue %}
+
+[More info](reference-rest-api.md#/admin/brokers/configuration/:configName/:configValue)
+
+#### Java
+
+```java
+admin.brokers().updateDynamicConfiguration(configName, configValue);
+```
+
+### List updatable parameters
+
+Fetch a list of all potentially updatable configuration parameters.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin brokers list-dynamic-config
+brokerShutdownTimeoutMs
+```
+
+#### REST API
+
+{% endpoint GET /admin/brokers/configuration %}
+
+[More info](reference-rest-api.md#/admin/brokers/configuration)
+
+#### Java
+
+```java
+admin.brokers().getDynamicConfigurationNames();
+```
+
+### List all updated values
+
+Fetch a list of all parameters that have been dynamically updated.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin brokers get-all-dynamic-config
+brokerShutdownTimeoutMs:100
+```
+
+#### REST API
+
+{% endpoint GET /admin/brokers/configuration/values %}
+
+[More info](reference-rest-api.md#/admin/brokers/configuration/values)
+
+#### Java
+
+```java
+admin.brokers().getAllDynamicConfigurations();
+```
diff --git a/site2/docs/admin-api-clusters.md b/site2/docs/admin-api-clusters.md
new file mode 100644
index 0000000000000000000000000000000000000000..d808b4de9d9edc840b9e92b3b794197c276d6a2c
--- /dev/null
+++ b/site2/docs/admin-api-clusters.md
@@ -0,0 +1,219 @@
+---
+id: admin-api-clusters
+title: Managing Clusters
+sidebar_label: Clusters
+---
+
+Pulsar clusters consist of one or more Pulsar {% popover brokers %}, one or more {% popover BookKeeper %} servers (aka {% popover bookies %}), and a {% popover ZooKeeper %} cluster that provides configuration and coordination management.
+
+Clusters can be managed via:
+
+* The [`clusters`](reference-pulsar-admin.md#clusters) command of the [`pulsar-admin`](reference-pulsar-admin.md) tool
+* The `/admin/clusters` endpoint of the admin [REST API](reference-rest-api.md)
+* The `clusters` method of the {% javadoc PulsarAdmin admin org.apache.pulsar.client.admin.PulsarAdmin %} object in the [Java API](client-libraries-java.md)
+
+## Clusters resources
+
+### Provision
+
+New clusters can be provisioned using the admin interface.
+
+> Please note that this operation requires superuser privileges.
+
+{% include message.html id="superuser" %}
+
+#### pulsar-admin
+
+You can provision a new cluster using the [`create`](reference-pulsar-admin.md#clusters-create) subcommand. Here's an example:
+
+```shell
+$ pulsar-admin clusters create cluster-1 \
+  --url http://my-cluster.org.com:8080 \
+  --broker-url pulsar://my-cluster.org.com:6650
+```
+
+#### REST API
+
+{% endpoint PUT /admin/clusters/:cluster %}
+
+[More info](reference-rest-api.md#/admin/clusters/:cluster)
+
+#### Java
+
+```java
+ClusterData clusterData = new ClusterData(
+    serviceUrl,
+    serviceUrlTls,
+    brokerServiceUrl,
+    brokerServiceUrlTls
+);
+admin.clusters().createCluster(clusterName, clusterData);
+```
+
+### Initialize cluster metadata
+
+When provisioning a new cluster, you need to initialize that cluster's [metadata](getting-started-concepts-and-architecture.md#metadata-store).
+When initializing cluster metadata, you need to specify all of the following:
+
+* The name of the cluster
+* The local ZooKeeper connection string for the cluster
+* The global ZooKeeper connection string for the entire instance
+* The web service URL for the cluster
+* A broker service URL enabling interaction with the {% popover brokers %} in the cluster
+
+You must initialize cluster metadata *before* starting up any [brokers](admin-api-brokers.md) that will belong to the cluster.
+
+{% include admonition.html type="warning" title="No cluster metadata initialization through the REST API or the Java admin API" content='
+Unlike most other admin functions in Pulsar, cluster metadata initialization cannot be performed via the admin REST API or the admin Java client, as metadata initialization involves communicating with ZooKeeper directly. Instead, you can use the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool, in particular the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command.
+' %}
+
+Here's an example cluster metadata initialization command:
+
+```shell
+bin/pulsar initialize-cluster-metadata \
+  --cluster us-west \
+  --zookeeper zk1.us-west.example.com:2181 \
+  --global-zookeeper zk1.us-west.example.com:2184 \
+  --web-service-url http://pulsar.us-west.example.com:8080/ \
+  --web-service-url-tls https://pulsar.us-west.example.com:8443/ \
+  --broker-service-url pulsar://pulsar.us-west.example.com:6650/ \
+  --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651/
+```
+
+You only need to use the `--*-tls` flags if you're using [TLS authentication](administration-auth.md#tls-client-auth) in your instance.
+
+### Get configuration
+
+You can fetch the [configuration](reference-configuration.md) for an existing cluster at any time.
+
+#### pulsar-admin
+
+Use the [`get`](reference-pulsar-admin.md#clusters-get) subcommand and specify the name of the cluster. Here's an example:
+
+```shell
+$ pulsar-admin clusters get cluster-1
+{
+  "serviceUrl": "http://my-cluster.org.com:8080/",
+  "serviceUrlTls": null,
+  "brokerServiceUrl": "pulsar://my-cluster.org.com:6650/",
+  "brokerServiceUrlTls": null,
+  "peerClusterNames": null
+}
+```
+
+#### REST API
+
+{% endpoint GET /admin/clusters/:cluster %}
+
+[More info](reference-rest-api.md#/admin/clusters/:cluster)
+
+#### Java
+
+```java
+admin.clusters().getCluster(clusterName);
+```
+
+### Update
+
+You can update the configuration for an existing cluster at any time.
+
+#### pulsar-admin
+
+Use the [`update`](reference-pulsar-admin.md#clusters-update) subcommand and specify new configuration values using flags.
+
+```shell
+$ pulsar-admin clusters update cluster-1 \
+  --url http://my-cluster.org.com:4081 \
+  --broker-url pulsar://my-cluster.org.com:3350
+```
+
+#### REST API
+
+{% endpoint POST /admin/clusters/:cluster %}
+
+[More info](reference-rest-api.md#/admin/clusters/:cluster)
+
+#### Java
+
+```java
+ClusterData clusterData = new ClusterData(
+    serviceUrl,
+    serviceUrlTls,
+    brokerServiceUrl,
+    brokerServiceUrlTls
+);
+admin.clusters().updateCluster(clusterName, clusterData);
+```
+
+### Delete
+
+Clusters can be deleted from a Pulsar {% popover instance %}.
+
+#### pulsar-admin
+
+Use the [`delete`](reference-pulsar-admin.md#clusters-delete) subcommand and specify the name of the cluster.
+
+```shell
+$ pulsar-admin clusters delete cluster-1
+```
+
+#### REST API
+
+{% endpoint DELETE /admin/clusters/:cluster %}
+
+[More info](reference-rest-api.md#/admin/clusters/:cluster)
+
+#### Java
+
+```java
+admin.clusters().deleteCluster(clusterName);
+```
+
+### List
+
+You can fetch a list of all clusters in a Pulsar {% popover instance %}.
+
+#### pulsar-admin
+
+Use the [`list`](reference-pulsar-admin.md#clusters-list) subcommand.
+
+```shell
+$ pulsar-admin clusters list
+cluster-1
+cluster-2
+```
+
+#### REST API
+
+{% endpoint GET /admin/clusters %}
+
+[More info](reference-rest-api.md#/admin/clusters)
+
+#### Java
+
+```java
+admin.clusters().getClusters();
+```
+
+### Update peer-cluster data
+
+Peer clusters can be configured for a given cluster in a Pulsar {% popover instance %}.
+
+#### pulsar-admin
+
+Use the [`update-peer-clusters`](reference-pulsar-admin.md#clusters-update-peer-clusters) subcommand and specify the list of peer-cluster names.
+
+```shell
+$ pulsar-admin clusters update-peer-clusters cluster-1 --peer-clusters cluster-2
+```
+
+#### REST API
+
+{% endpoint POST /admin/clusters/:cluster/peers %}
+
+[More info](reference-rest-api.md#/admin/clusters/:cluster/peers)
+
+#### Java
+
+```java
+admin.clusters().updatePeerClusterNames(clusterName, peerClusterList);
+```
diff --git a/site2/docs/admin-api-namespaces.md b/site2/docs/admin-api-namespaces.md
new file mode 100644
index 0000000000000000000000000000000000000000..8e366e76538b24cdbddf71fdf07d545845aa1e99
--- /dev/null
+++ b/site2/docs/admin-api-namespaces.md
@@ -0,0 +1,668 @@
+---
+id: admin-api-namespaces
+title: Managing Namespaces
+sidebar_label: Namespaces
+---
+
+Pulsar {% popover namespaces %} are logical groupings of {% popover topics %}.
+
+Namespaces can be managed via:
+
+* The [`namespaces`](reference-pulsar-admin.md#namespaces) command of the [`pulsar-admin`](reference-pulsar-admin.md) tool
+* The `/admin/namespaces` endpoint of the admin [REST API](reference-rest-api.md)
+* The `namespaces` method of the {% javadoc PulsarAdmin admin org.apache.pulsar.client.admin.PulsarAdmin %} object in the [Java API](client-libraries-java.md)
+
+## Namespaces resources
+
+### Create
+
+You can create new namespaces under a given {% popover tenant %}.
+
+#### pulsar-admin
+
+Use the [`create`](reference-pulsar-admin.md#namespaces-create) subcommand and specify the namespace by name:
+
+```shell
+$ pulsar-admin namespaces create test-tenant/test-namespace
+```
+
+#### REST API
+
+{% endpoint PUT /admin/namespaces/:tenant/:cluster/:namespace %}
+
+[More info](reference-rest-api.md#/admin/namespaces/:tenant/:cluster/:namespace)
+
+#### Java
+
+```java
+admin.namespaces().createNamespace(namespace);
+```
+
+### Get policies
+
+You can fetch the current policies associated with a namespace at any time.
+
+#### pulsar-admin
+
+Use the [`policies`](reference-pulsar-admin.md#namespaces-policies) subcommand and specify the namespace:
+
+```shell
+$ pulsar-admin namespaces policies test-tenant/test-namespace
+{
+  "auth_policies": {
+    "namespace_auth": {},
+    "destination_auth": {}
+  },
+  "replication_clusters": [],
+  "bundles_activated": true,
+  "bundles": {
+    "boundaries": [
+      "0x00000000",
+      "0xffffffff"
+    ],
+    "numBundles": 1
+  },
+  "backlog_quota_map": {},
+  "persistence": null,
+  "latency_stats_sample_rate": {},
+  "message_ttl_in_seconds": 0,
+  "retention_policies": null,
+  "deleted": false
+}
+```
+
+#### REST API
+
+{% endpoint GET /admin/namespaces/:tenant/:cluster/:namespace %}
+
+[More info](reference-rest-api.md#/admin/namespaces/:tenant/:cluster/:namespace)
+
+#### Java
+
+```java
+admin.namespaces().getPolicies(namespace);
+```
+
+### List namespaces within a tenant
+
+You can list all namespaces within a given Pulsar {% popover tenant %}.
+
+#### pulsar-admin
+
+Use the [`list`](reference-pulsar-admin.md#namespaces-list) subcommand and specify the tenant:
+
+```shell
+$ pulsar-admin namespaces list test-tenant
+test-tenant/ns1
+test-tenant/ns2
+```
+
+#### REST API
+
+{% endpoint GET /admin/namespaces/:tenant %}
+
+[More info](reference-rest-api.md#/admin/namespaces/:tenant)
+
+#### Java
+
+```java
+admin.namespaces().getNamespaces(tenant);
+```
+
+### List namespaces within a cluster
+
+You can list all namespaces within a given Pulsar {% popover cluster %}.
+
+#### pulsar-admin
+
+Use the [`list-cluster`](reference-pulsar-admin.md#namespaces-list-cluster) subcommand and specify the cluster:
+
+```shell
+$ pulsar-admin namespaces list-cluster test-tenant/cl1
+test-tenant/ns1
+test-tenant/ns2
+```
+
+#### REST API
+
+{% endpoint GET /admin/namespaces/:tenant/:cluster %}
+
+[More info](reference-rest-api.md#/admin/namespaces/:tenant/:cluster)
+
+#### Java
+
+```java
+admin.namespaces().getNamespaces(tenant, cluster);
+```
+
+### Delete
+
+You can delete existing namespaces from a tenant.
+
+#### pulsar-admin
+
+Use the [`delete`](reference-pulsar-admin.md#namespaces-delete) subcommand and specify the namespace:
+
+```shell
+$ pulsar-admin namespaces delete test-tenant/ns1
+```
+
+#### REST API
+
+{% endpoint DELETE /admin/namespaces/:tenant/:cluster/:namespace %}
+
+[More info](reference-rest-api.md#/admin/namespaces/:tenant/:cluster/:namespace)
+
+#### Java
+
+```java
+admin.namespaces().deleteNamespace(namespace);
+```
+
+
+#### Set replication cluster
+
+It sets the replication clusters for a namespace, so Pulsar can internally replicate published messages from one colo to another.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces set-clusters test-tenant/ns1 \
+  --clusters cl1
+```
+
+###### REST
+
+{% endpoint POST /admin/namespaces/:tenant/:namespace/replication %}
+
+###### Java
+
+```java
+admin.namespaces().setNamespaceReplicationClusters(namespace, clusters);
+```
+
+#### Get replication cluster
+
+It gives the list of replication clusters for a given namespace.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces get-clusters test-tenant/cl1/ns1
+```
+
+```
+cl2
+```
+
+###### REST
+
+```
+GET /admin/namespaces/{tenant}/{namespace}/replication
+```
+
+###### Java
+
+```java
+admin.namespaces().getNamespaceReplicationClusters(namespace)
+```
+
+#### Set backlog quota policies
+
+Backlog quota helps the broker to restrict the bandwidth/storage of a namespace once it reaches a certain threshold. The admin can set the limit and one of the following actions to take after the limit is reached:
+
+1. `producer_request_hold`: the broker holds and does not persist the produce request payload
+
+2. `producer_exception`: the broker disconnects the client with an exception
+
+3. `consumer_backlog_eviction`: the broker starts discarding backlog messages
+
+The backlog quota restriction can be applied by setting the backlog-quota-type `destination_storage`.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces set-backlog-quota --limit 10 --policy producer_request_hold test-tenant/ns1
+```
+
+###### REST
+
+```
+POST /admin/namespaces/{tenant}/{namespace}/backlogQuota
+```
+
+###### Java
+
+```java
+admin.namespaces().setBacklogQuota(namespace, new BacklogQuota(limit, policy))
+```
+
+#### Get backlog quota policies
+
+It shows the configured backlog quota for a given namespace.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces get-backlog-quotas test-tenant/ns1
+```
+
+```json
+{
+  "destination_storage": {
+    "limit": 10,
+    "policy": "producer_request_hold"
+  }
+}
+```
+
+###### REST
+
+```
+GET /admin/namespaces/{tenant}/{namespace}/backlogQuotaMap
+```
+
+###### Java
+
+```java
+admin.namespaces().getBacklogQuotaMap(namespace);
+```
+
+#### Remove backlog quota policies
+
+It removes the backlog quota policies for a given namespace.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces remove-backlog-quota test-tenant/ns1
+```
+
+###### REST
+
+```
+DELETE /admin/namespaces/{tenant}/{namespace}/backlogQuota
+```
+
+###### Java
+
+```java
+admin.namespaces().removeBacklogQuota(namespace, backlogQuotaType)
+```
+
+#### Set persistence policies
+
+Persistence policies allow you to configure the persistence level for all topic messages under a given namespace.
+
+- `bookkeeper-ack-quorum`: Number of acks (guaranteed copies) to wait for each entry, default: 0
+
+- `bookkeeper-ensemble`: Number of bookies to use for a topic, default: 0
+
+- `bookkeeper-write-quorum`: How many writes to make for each entry, default: 0
+
+- `ml-mark-delete-max-rate`: Throttling rate of the mark-delete operation (0 means no throttle), default: 0.0
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces set-persistence --bookkeeper-ack-quorum 2 --bookkeeper-ensemble 3 --bookkeeper-write-quorum 2 --ml-mark-delete-max-rate 0 test-tenant/ns1
+```
+
+###### REST
+
+```
+POST /admin/namespaces/{tenant}/{namespace}/persistence
+```
+
+###### Java
+
+```java
+admin.namespaces().setPersistence(namespace, new PersistencePolicies(bookkeeperEnsemble, bookkeeperWriteQuorum, bookkeeperAckQuorum, managedLedgerMaxMarkDeleteRate))
+```
+
+
+#### Get persistence policies
+
+It shows the configured persistence policies of a given namespace.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces get-persistence test-tenant/ns1
+```
+
+```json
+{
+  "bookkeeperEnsemble": 3,
+  "bookkeeperWriteQuorum": 2,
+  "bookkeeperAckQuorum": 2,
+  "managedLedgerMaxMarkDeleteRate": 0
+}
+```
+
+###### REST
+
+```
+GET /admin/namespaces/{tenant}/{namespace}/persistence
+```
+
+###### Java
+
+```java
+admin.namespaces().getPersistence(namespace)
+```
+
+
+#### Unload namespace bundle
+
+A namespace bundle is a virtual group of topics that belong to the same namespace. If a broker gets overloaded with the number of bundles it owns, this command can help unload a heavy bundle from that broker, so that it can be served by some other, less-loaded broker. A namespace bundle is defined by its start and end hash range, such as 0x00000000 and 0xffffffff.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces unload --bundle 0x00000000_0xffffffff test-tenant/ns1
+```
+
+###### REST
+
+```
+PUT /admin/namespaces/{tenant}/{namespace}/unload
+```
+
+###### Java
+
+```java
+admin.namespaces().unloadNamespaceBundle(namespace, bundle)
+```
+
+
+#### Set message-ttl
+
+It configures the time to live (in seconds) for messages in the namespace.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces set-message-ttl --messageTTL 100 test-tenant/ns1
+```
+
+###### REST
+
+```
+POST /admin/namespaces/{tenant}/{namespace}/messageTTL
+```
+
+###### Java
+
+```java
+admin.namespaces().setNamespaceMessageTTL(namespace, messageTTL)
+```
+
+#### Get message-ttl
+
+It gives the configured message TTL of the namespace.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces get-message-ttl test-tenant/ns1
+```
+
+```
+100
+```
+
+###### REST
+
+```
+GET /admin/namespaces/{tenant}/{namespace}/messageTTL
+```
+
+###### Java
+
+```java
+admin.namespaces().getNamespaceMessageTTL(namespace)
+```
+
+
+#### Split bundle
+
+Each namespace bundle can contain multiple topics, and each bundle can be served by only one broker. If a bundle gets heavy with multiple live topics in it, it creates load on that broker; to resolve this, the admin can split the bundle using this command.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces split-bundle --bundle 0x00000000_0xffffffff test-tenant/ns1
+```
+
+###### REST
+
+```
+PUT /admin/namespaces/{tenant}/{namespace}/{bundle}/split
+```
+
+###### Java
+
+```java
+admin.namespaces().splitNamespaceBundle(namespace, bundle)
+```
+
+
+#### Clear backlog
+
+It clears the message backlog of all the topics that belong to a specific namespace. You can also clear the backlog for a specific subscription.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces clear-backlog --sub my-subscription test-tenant/ns1
+```
+
+###### REST
+
+```
+POST /admin/namespaces/{tenant}/{namespace}/clearBacklog
+```
+
+###### Java
+
+```java
+admin.namespaces().clearNamespaceBacklogForSubscription(namespace, subscription)
+```
+
+
+#### Clear bundle backlog
+
+It clears the message backlog of all the topics that belong to a specific namespace bundle. You can also clear the backlog for a specific subscription.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces clear-backlog --bundle 0x00000000_0xffffffff --sub my-subscription test-tenant/ns1
+```
+
+###### REST
+
+```
+POST /admin/namespaces/{tenant}/{namespace}/{bundle}/clearBacklog
+```
+
+###### Java
+
+```java
+admin.namespaces().clearNamespaceBundleBacklogForSubscription(namespace, bundle, subscription)
+```
+
+
+#### Set retention
+
+Each namespace contains multiple topics, and each topic's retention size (storage size) should not exceed a specific threshold, or messages should be stored only for a certain duration. This command configures the retention size and time of topics in a given namespace.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces set-retention --size 10 --time 100 test-tenant/ns1
+```
+
+###### REST
+
+```
+POST /admin/namespaces/{tenant}/{namespace}/retention
+```
+
+###### Java
+
+```java
+admin.namespaces().setRetention(namespace, new RetentionPolicies(retentionTimeInMin, retentionSizeInMB))
+```
+
+
+#### Get retention
+
+It shows the retention information of a given namespace.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces get-retention test-tenant/ns1
+```
+
+```json
+{
+  "retentionTimeInMinutes": 10,
+  "retentionSizeInMB": 100
+}
+```
+
+###### REST
+
+```
+GET /admin/namespaces/{tenant}/{namespace}/retention
+```
+
+###### Java
+
+```java
+admin.namespaces().getRetention(namespace)
+```
+
+#### Set dispatch throttling
+
+It sets the message dispatch rate for all the topics under a given namespace.
+The dispatch rate can be restricted by the number of messages per X seconds (`msg-dispatch-rate`) or by the number of message-bytes per X seconds (`byte-dispatch-rate`).
+The period X is expressed in seconds and can be configured with `dispatch-rate-period`. The default value of `msg-dispatch-rate` and `byte-dispatch-rate` is -1, which
+disables the throttling.
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces set-dispatch-rate test-tenant/ns1 \
+  --msg-dispatch-rate 1000 \
+  --byte-dispatch-rate 1048576 \
+  --dispatch-rate-period 1
+```
+
+###### REST
+
+```
+POST /admin/namespaces/{tenant}/{namespace}/dispatchRate
+```
+
+###### Java
+
+```java
+admin.namespaces().setDispatchRate(namespace, 1000, 1048576, 1)
+```
+
+#### Get configured message-rate
+
+It shows the configured message rate for the namespace (topics under this namespace can dispatch this many messages per second).
+
+###### CLI
+
+```shell
+$ pulsar-admin namespaces get-dispatch-rate test-tenant/ns1
+```
+
+```json
+{
+  "dispatchThrottlingRatePerTopicInMsg" : 1000,
+  "dispatchThrottlingRatePerTopicInByte" : 1048576,
+  "ratePeriodInSecond" : 1
+}
+```
+
+###### REST
+
+```
+GET /admin/namespaces/{tenant}/{namespace}/dispatchRate
+```
+
+###### Java
+
+```java
+admin.namespaces().getDispatchRate(namespace)
+```
+
+
+### Namespace isolation
+
+Coming soon.
+
+### Unloading from a broker
+
+You can unload a namespace, or a {% popover namespace bundle %}, from the Pulsar {% popover broker %} that is currently responsible for it.
+
+#### pulsar-admin
+
+Use the [`unload`](reference-pulsar-admin.md#namespaces-unload) subcommand of the [`namespaces`](reference-pulsar-admin.md#namespaces) command.
+
+##### Example
+
+```shell
+$ pulsar-admin namespaces unload my-tenant/my-ns
+```
+
+#### REST API
+
+#### Java
+
diff --git a/site2/docs/admin-api-non-persistent-topics.md b/site2/docs/admin-api-non-persistent-topics.md
new file mode 100644
index 0000000000000000000000000000000000000000..eef44079a3f88499bc400559547c1d26d52783a5
--- /dev/null
+++ b/site2/docs/admin-api-non-persistent-topics.md
@@ -0,0 +1,254 @@
+---
+id: admin-api-non-persistent-topics
+title: Managing non-persistent topics
+sidebar_label: Non-Persistent topics
+---
+
+Non-persistent topics can be used in applications that only need to consume messages published in real time and
+do not need persistence guarantees. Using them can also reduce message-publish latency by removing the overhead of
+persisting messages.
+
+In all of the instructions and commands below, the topic name structure is:
+
+{% include topic.html ten="tenant" n="namespace" t="topic" %}
+
+## Non-persistent topics resources
+
+### Get stats
+
+It shows the current statistics of a given non-partitioned topic:
+
+  - **msgRateIn**: The sum of all local and replication publishers' publish rates in messages per second
+
+  - **msgThroughputIn**: Same as above, but in bytes per second instead of messages per second
+
+  - **msgRateOut**: The sum of all local and replication consumers' dispatch rates in messages per second
+
+  - **msgThroughputOut**: Same as above, but in bytes per second instead of messages per second
+
+  - **averageMsgSize**: The average size in bytes of messages published within the last interval
+
+  - **publishers**: The list of all local publishers into the topic. There can be zero or thousands
+
+    - **averageMsgSize**: Average message size in bytes from this publisher within the last interval
+
+    - **producerId**: Internal identifier for this producer on this topic
+
+    - **producerName**: Internal identifier for this producer, generated by the client library
+
+    - **address**: IP address and source port for the connection of this producer
+
+    - **connectedSince**: Timestamp this producer was created or last reconnected
+
+  - **subscriptions**: The list of all local subscriptions to the topic
+
+    - **my-subscription**: The name of this subscription (client defined)
+
+      - **type**: This subscription's type
+
+      - **consumers**: The list of connected consumers for this subscription
+
+        - **consumerName**: Internal identifier for this consumer, generated by the client library
+
+        - **availablePermits**: The number of messages this consumer has space for in the client library's listen queue. A value of 0 means the client library's queue is full and receive() isn't being called. A nonzero value means this consumer is ready to be dispatched messages.
+
+  - **replication**: This section gives the stats for cross-colo replication of this topic
+
+    - **connected**: Whether the outbound replicator is connected
+
+    - **inboundConnection**: The IP and port of the broker in the remote cluster's publisher connection to this broker
+
+    - **inboundConnectedSince**: The TCP connection being used to publish messages to the remote cluster. If there are no local publishers connected, this connection is automatically closed after a minute.
+
+  - **msgDropRate**: The rate at which messages are being dropped. For publishers, the broker only allows a configured number of in-flight messages per connection and drops all published messages above that threshold; the broker also drops messages for subscriptions when dispatch permits are unavailable or the connection is not writable.
+
+
+```json
+{
+  "msgRateIn": 4641.528542257553,
+  "msgThroughputIn": 44663039.74947473,
+  "msgRateOut": 0,
+  "msgThroughputOut": 0,
+  "averageMsgSize": 1232439.816728665,
+  "storageSize": 135532389160,
+  "msgDropRate" : 0.0,
+  "publishers": [
+    {
+      "msgRateIn": 57.855383881403576,
+      "msgThroughputIn": 558994.7078932219,
+      "averageMsgSize": 613135,
+      "producerId": 0,
+      "producerName": null,
+      "address": null,
+      "connectedSince": null,
+      "msgDropRate" : 0.0
+    }
+  ],
+  "subscriptions": {
+    "my-topic_subscription": {
+      "msgRateOut": 0,
+      "msgThroughputOut": 0,
+      "msgBacklog": 116632,
+      "type": null,
+      "msgRateExpired": 36.98245516804671,
+      "consumers" : [ {
+        "msgRateOut" : 20343.506296021893,
+        "msgThroughputOut" : 2.0979855364233278E7,
+        "msgRateRedeliver" : 0.0,
+        "consumerName" : "fe3c0",
+        "availablePermits" : 950,
+        "unackedMessages" : 0,
+        "blockedConsumerOnUnackedMsgs" : false,
+        "address" : "/10.73.210.249:60578",
+        "connectedSince" : "2017-07-26 15:13:48.026-0700",
+        "clientVersion" : "1.19-incubating-SNAPSHOT"
+      } ],
+      "msgDropRate" : 432.2390921571593
+    }
+  },
+  "replication": {}
+}
+```
+
+#### pulsar-admin
+
+Topic stats can be fetched using the [`stats`](reference-pulsar-admin.md#stats) command.
+
+```shell
+$ pulsar-admin non-persistent stats \
+  non-persistent://test-tenant/ns1/tp1
+```
+
+#### REST API
+
+{% endpoint GET /admin/non-persistent/:tenant/:namespace/:destination/stats %}
+
+
+#### Java
+
+```java
+String destination = "non-persistent://my-tenant/my-namespace/my-topic";
+admin.nonPersistentTopics().getStats(destination);
+```
+
+### Get internal stats
+
+It shows the detailed statistics of a topic.
+
+#### pulsar-admin
+
+Topic internal stats can be fetched using the [`stats-internal`](reference-pulsar-admin.md#stats-internal) command.
+
+```shell
+$ pulsar-admin non-persistent stats-internal \
+  non-persistent://test-tenant/ns1/tp1
+
+{
+  "entriesAddedCounter" : 48834,
+  "numberOfEntries" : 0,
+  "totalSize" : 0,
+  "cursors" : {
+    "s1" : {
+      "waitingReadOp" : false,
+      "pendingReadOps" : 0,
+      "messagesConsumedCounter" : 0,
+      "cursorLedger" : 0,
+      "cursorLedgerLastEntry" : 0
+    }
+  }
+}
+```
+
+#### REST API
+
+{% endpoint GET /admin/non-persistent/:tenant/:namespace/:destination/internalStats %}
+
+
+#### Java
+
+```java
+String destination = "non-persistent://my-tenant/my-namespace/my-topic";
+admin.nonPersistentTopics().getInternalStats(destination);
+```
+
+### Create partitioned topic
+
+Partitioned topics in Pulsar must be explicitly created. When creating a new partitioned topic, you need to provide a name for the topic as well as the desired number of partitions.
+
+#### pulsar-admin
+
+```shell
+$ bin/pulsar-admin non-persistent create-partitioned-topic \
+  non-persistent://my-tenant/my-namespace/my-topic \
+  --partitions 4
+```
+
+#### REST API
+
+{% endpoint PUT /admin/non-persistent/:tenant/:namespace/:destination/partitions %}
+
+#### Java
+
+```java
+String topicName = "non-persistent://my-tenant/my-namespace/my-topic";
+int numPartitions = 4;
+admin.nonPersistentTopics().createPartitionedTopic(topicName, numPartitions);
+```
+
+### Get metadata
+
+Partitioned topics have metadata associated with them that you can fetch as a JSON object.
+The following metadata fields are currently available:
+
+Field | Meaning
+:-----|:-------
+`partitions` | The number of partitions into which the topic is divided
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin non-persistent get-partitioned-topic-metadata \
+  non-persistent://my-tenant/my-namespace/my-topic
+{
+  "partitions": 4
+}
+```
+
+#### REST API
+
+{% endpoint GET /admin/non-persistent/:tenant/:namespace/:destination/partitions %}
+
+
+#### Java
+
+```java
+String topicName = "non-persistent://my-tenant/my-namespace/my-topic";
+admin.nonPersistentTopics().getPartitionedTopicMetadata(topicName);
+```
+
+### Unload topic
+
+It unloads a topic.
+
+#### pulsar-admin
+
+A topic can be unloaded using the [`unload`](reference-pulsar-admin.md#unload) command.
+
+```shell
+$ pulsar-admin non-persistent unload \
+  non-persistent://test-tenant/ns1/tp1
+```
+
+#### REST API
+
+{% endpoint PUT /admin/non-persistent/:tenant/:namespace/:destination/unload %}
+
+[More info](reference-rest-api.md#/admin/non-persistent/:tenant/:namespace/:destination/unload)
+
+#### Java
+
+```java
+String destination = "non-persistent://my-tenant/my-namespace/my-topic";
+admin.nonPersistentTopics().unload(destination);
+```
diff --git a/site2/docs/admin-api-overview.md b/site2/docs/admin-api-overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..a730a7d130f8be88e8cdb841e6192d5b44d1e927
--- /dev/null
+++ b/site2/docs/admin-api-overview.md
@@ -0,0 +1,68 @@
+---
+id: admin-api-overview
+title: The Pulsar admin interface
+sidebar_label: Overview
+---
+
+The Pulsar admin interface enables you to manage all of the important entities in a Pulsar {% popover instance %}, such as {% popover properties %}, {% popover topics %}, and {% popover namespaces %}.
+
+You can currently interact with the admin interface via:
+
+- Making HTTP calls against the admin [REST API](reference-rest-api.md) provided by Pulsar {% popover brokers %}. Some REST API calls might be redirected to the topic owner broker for serving
+  with a [`307 Temporary Redirect`](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/307), so HTTP callers should handle `307 Temporary Redirect` responses. If you are using `curl`, you should specify `-L`
+  to handle redirections.
+- The `pulsar-admin` CLI tool, which is available in the `bin` folder of your [Pulsar installation](getting-started-standalone.md):
+
+```shell
+$ bin/pulsar-admin
+```
+
+Full documentation for this tool can be found in the [Pulsar command-line tools](reference-pulsar-admin.md) doc.
+
+- A Java client interface.
+
+> #### The REST API is the admin interface
+> Under the hood, both the `pulsar-admin` CLI tool and the Java client use the REST API. If you'd like to implement your own admin interface client, you should use the REST API as well. Full documentation can be found in the [REST API reference](reference-rest-api.md).
+
+{% include message.html id="admin_rest_api" %}
+
+In this document, examples from each of the three available interfaces will be shown.
+
+## Admin setup
+
+{% include explanations/admin-setup.md %}
+
+Each of Pulsar's three admin interfaces---the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool, the [Java admin API](/api/admin), and the [REST API](reference-rest-api.md)---requires some special setup if you have [authentication](administration-auth.md#authentication-providers) enabled in your Pulsar {% popover instance %}.
+
+### pulsar-admin
+
+If you have [authentication](administration-auth.md#authentication-providers) enabled, you will need to provide an auth configuration to use the [`pulsar-admin`](reference-pulsar-admin.md) tool. By default, the configuration for the `pulsar-admin` tool is found in the [`conf/client.conf`](reference-configuration.md#client) file. Here are the available parameters:
+
+{% include config.html id="client" %}
+
+### REST API
+
+You can find documentation for the REST API exposed by Pulsar {% popover brokers %} in [this reference document](reference-rest-api.md).
+
+### Java admin client
+
+To use the Java admin API, instantiate a {% javadoc PulsarAdmin admin org.apache.pulsar.client.admin.PulsarAdmin %} object, specifying a URL for a Pulsar {% popover broker %} and a {% javadoc ClientConfiguration admin org.apache.pulsar.client.admin.ClientConfiguration %}. Here's a minimal example using `localhost`:
+
+```java
+URL url = new URL("http://localhost:8080");
+// Pass the auth-plugin class's fully qualified name if Pulsar security is enabled
+String authPluginClassName = "com.org.MyAuthPluginClass";
+// Pass auth-params if the auth-plugin class requires them
+String authParams = "param1=value1";
+boolean useTls = false;
+boolean tlsAllowInsecureConnection = false;
+String tlsTrustCertsFilePath = null;
+
+ClientConfiguration config = new ClientConfiguration();
+config.setAuthentication(authPluginClassName, authParams);
+config.setUseTls(useTls);
+config.setTlsAllowInsecureConnection(tlsAllowInsecureConnection);
+config.setTlsTrustCertsFilePath(tlsTrustCertsFilePath);
+
+PulsarAdmin admin = new PulsarAdmin(url, config);
+```
diff --git a/site2/docs/admin-api-partitioned-topics.md b/site2/docs/admin-api-partitioned-topics.md
new file mode 100644
index 0000000000000000000000000000000000000000..db497186d06bf2ec2cc3234bba955264d60ccefc
--- /dev/null
+++ b/site2/docs/admin-api-partitioned-topics.md
@@ -0,0 +1,16 @@
+---
+id: admin-api-partitioned-topics
+title: Managing partitioned topics
+sidebar_label: Partitioned topics
+---
+
+
+You can use Pulsar's [admin API](admin-api-overview.md) to create and manage partitioned topics.
+
+In all of the instructions and commands below, the topic name structure is:
+
+{% include topic.html ten="tenant" n="namespace" t="topic" %}
+
+## Partitioned topics resources
+
+{% include explanations/partitioned-topic-admin.md %}
diff --git a/site2/docs/admin-api-permissions.md b/site2/docs/admin-api-permissions.md
new file mode 100644
index 0000000000000000000000000000000000000000..8a7793e9b17399ffe66b99c25c73479bb129df2d
--- /dev/null
+++ b/site2/docs/admin-api-permissions.md
@@ -0,0 +1,9 @@
+---
+id: admin-api-permissions
+title: Managing permissions
+sidebar_label: Permissions
+---
+
+## Permissions resources
+
+{% include explanations/permissions.md %}
diff --git a/site2/docs/admin-api-persistent-topics.md b/site2/docs/admin-api-persistent-topics.md
new file mode 100644
index 0000000000000000000000000000000000000000..6a7cb2e1daa20ca263ddc09150da10dfaad165b7
--- /dev/null
+++ b/site2/docs/admin-api-persistent-topics.md
@@ -0,0 +1,627 @@
+---
+id: admin-api-persistent-topics
+title: Managing persistent topics
+sidebar_label: Persistent topics
+---
+
+A persistent topic is a logical endpoint for publishing and consuming messages. Producers publish messages to the topic, and consumers subscribe to it to consume the messages published to it.
+
+In all of the instructions and commands below, the topic name structure is:
+
+{% include topic.html ten="tenant" n="namespace" t="topic" %}
+
+## Persistent topics resources
+
+### List of topics
+
+It provides a list of the persistent topics that exist under a given namespace.
+
+#### pulsar-admin
+
+The list of topics can be fetched using the [`list`](../../reference/CliTools#list) command.
+
+```shell
+$ pulsar-admin persistent list \
+  my-tenant/my-namespace
+```
+
+#### REST API
+
+{% endpoint GET /admin/persistent/:tenant/:namespace %}
+
+[More info](../../reference/RestApi#/admin/persistent/:tenant/:namespace)
+
+#### Java
+
+```java
+String namespace = "my-tenant/my-namespace";
+admin.persistentTopics().getList(namespace);
+```
+
+### Grant permission
+
+It grants permissions to a client role to perform specific actions on a given topic.
+
+#### pulsar-admin
+
+Permission can be granted using the [`grant-permission`](../../reference/CliTools#grant-permission) command.
+
+```shell
+$ pulsar-admin persistent grant-permission \
+  --actions produce,consume --role application1 \
+  persistent://test-tenant/ns1/tp1
+```
+
+#### REST API
+
+{% endpoint POST /admin/namespaces/:tenant/:namespace/permissions/:role %}
+
+[More info](../../reference/RestApi#/admin/namespaces/:tenant/:namespace/permissions/:role)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+String role = "test-role";
+Set<AuthAction> actions = Sets.newHashSet(AuthAction.produce, AuthAction.consume);
+admin.persistentTopics().grantPermission(destination, role, actions);
+```
+
+### Get permission
+
+Permission can be fetched using the [`permissions`](../../reference/CliTools#permissions) command.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin persistent permissions \
+  persistent://test-tenant/ns1/tp1
+
+{
+  "application1": [
+    "consume",
+    "produce"
+  ]
+}
+```
+
+#### REST API
+
+{% endpoint GET /admin/namespaces/:tenant/:namespace/permissions %}
+
+[More info](../../reference/RestApi#/admin/namespaces/:tenant/:namespace/permissions)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+admin.persistentTopics().getPermissions(destination);
+```
+
+### Revoke permission
+
+It revokes a permission which was granted to a client role.
+
+#### pulsar-admin
+
+Permission can be revoked using the [`revoke-permission`](../../reference/CliTools#revoke-permission) command.
+
+```shell
+$ pulsar-admin persistent revoke-permission \
+  --role application1 \
+  persistent://test-tenant/ns1/tp1
+
+{
+  "application1": [
+    "consume",
+    "produce"
+  ]
+}
+```
+
+#### REST API
+
+{% endpoint DELETE /admin/namespaces/:tenant/:namespace/permissions/:role %}
+
+[More info](../../reference/RestApi#/admin/namespaces/:tenant/:namespace/permissions/:role)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+String role = "test-role";
+admin.persistentTopics().revokePermissions(destination, role);
+```
+
+### Delete topic
+
+It deletes a topic. The topic cannot be deleted if there are any active subscriptions or producers connected to it.
+
+#### pulsar-admin
+
+A topic can be deleted using the [`delete`](../../reference/CliTools#delete) command.
+
+```shell
+$ pulsar-admin persistent delete \
+  persistent://test-tenant/ns1/tp1
+```
+
+#### REST API
+
+{% endpoint DELETE /admin/persistent/:tenant/:namespace/:destination %}
+
+[More info](../../reference/RestApi#/admin/persistent/:tenant/:namespace/:destination)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+admin.persistentTopics().delete(destination);
+```
+
+### Unload topic
+
+It unloads a topic.
+
+#### pulsar-admin
+
+A topic can be unloaded using the [`unload`](../../reference/CliTools#unload) command.
+
+```shell
+$ pulsar-admin persistent unload \
+  persistent://test-tenant/ns1/tp1
+```
+
+#### REST API
+
+{% endpoint PUT /admin/persistent/:tenant/:namespace/:destination/unload %}
+
+[More info](../../reference/RestApi#/admin/persistent/:tenant/:namespace/:destination/unload)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+admin.persistentTopics().unload(destination);
+```
+
+### Get stats
+
+It shows current statistics of a given non-partitioned topic.
+
+  - **msgRateIn**: The sum of all local and replication publishers' publish rates in messages per second
+
+  - **msgThroughputIn**: Same as above, but in bytes per second instead of messages per second
+
+  - **msgRateOut**: The sum of all local and replication consumers' dispatch rates in messages per second
+
+  - **msgThroughputOut**: Same as above, but in bytes per second instead of messages per second
+
+  - **averageMsgSize**: The average size in bytes of messages published within the last interval
+
+  - **storageSize**: The sum of the ledgers' storage size for this topic
+
+  - **publishers**: The list of all local publishers into the topic. There can be zero or thousands.
+
+    - **averageMsgSize**: Average message size in bytes from this publisher within the last interval
+
+    - **producerId**: Internal identifier for this producer on this topic
+
+    - **producerName**: Internal identifier for this producer, generated by the client library
+
+    - **address**: IP address and source port for the connection of this producer
+
+    - **connectedSince**: Timestamp this producer was created or last reconnected
+
+  - **subscriptions**: The list of all local subscriptions to the topic
+
+    - **my-subscription**: The name of this subscription (client defined)
+
+      - **msgBacklog**: The count of messages in backlog for this subscription
+
+      - **type**: This subscription type
+
+      - **msgRateExpired**: The rate at which messages were discarded instead of dispatched from this subscription due to TTL
+
+      - **consumers**: The list of connected consumers for this subscription
+
+        - **consumerName**: Internal identifier for this consumer, generated by the client library
+
+        - **availablePermits**: The number of messages this consumer has space for in the client library's listen queue. A value of 0 means the client library's queue is full and `receive()` isn't being called. A nonzero value means this consumer is ready to be dispatched messages.
+
+  - **replication**: This section gives the stats for cross-colo replication of this topic
+
+    - **replicationBacklog**: The outbound replication backlog in messages
+
+    - **connected**: Whether the outbound replicator is connected
+
+    - **replicationDelayInSeconds**: How long the oldest message has been waiting to be sent through the connection, if connected is true
+
+    - **inboundConnection**: The IP and port of the broker in the remote cluster's publisher connection to this broker
+
+    - **inboundConnectedSince**: The TCP connection being used to publish messages to the remote cluster. If there are no local publishers connected, this connection is automatically closed after a minute.
+
+```json
+{
+  "msgRateIn": 4641.528542257553,
+  "msgThroughputIn": 44663039.74947473,
+  "msgRateOut": 0,
+  "msgThroughputOut": 0,
+  "averageMsgSize": 1232439.816728665,
+  "storageSize": 135532389160,
+  "publishers": [
+    {
+      "msgRateIn": 57.855383881403576,
+      "msgThroughputIn": 558994.7078932219,
+      "averageMsgSize": 613135,
+      "producerId": 0,
+      "producerName": null,
+      "address": null,
+      "connectedSince": null
+    }
+  ],
+  "subscriptions": {
+    "my-topic_subscription": {
+      "msgRateOut": 0,
+      "msgThroughputOut": 0,
+      "msgBacklog": 116632,
+      "type": null,
+      "msgRateExpired": 36.98245516804671,
+      "consumers": []
+    }
+  },
+  "replication": {}
+}
+```
+
+#### pulsar-admin
+
+Topic stats can be fetched using the [`stats`](../../reference/CliTools#stats) command.
+
+```shell
+$ pulsar-admin persistent stats \
+  persistent://test-tenant/ns1/tp1
+```
+
+#### REST API
+
+{% endpoint GET /admin/persistent/:tenant/:namespace/:destination/stats %}
+
+[More info](../../reference/RestApi#/admin/persistent/:tenant/:namespace/:destination/stats)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+admin.persistentTopics().getStats(destination);
+```
+
+### Get internal stats
+
+It shows detailed statistics of a topic.
+
+  - **entriesAddedCounter**: Messages published since this broker loaded this topic
+
+  - **numberOfEntries**: Total number of messages being tracked
+
+  - **totalSize**: Total storage size in bytes of all messages
+
+  - **currentLedgerEntries**: Count of messages written to the ledger currently open for writing
+
+  - **currentLedgerSize**: Size in bytes of messages written to the ledger currently open for writing
+
+  - **lastLedgerCreatedTimestamp**: Time when the last ledger was created
+
+  - **lastLedgerCreationFailureTimestamp**: Time when the last ledger creation failed
+
+  - **waitingCursorsCount**: How many cursors are "caught up" and waiting for a new message to be published
+
+  - **pendingAddEntriesCount**: How many messages' (asynchronous) write requests are awaiting completion
+
+  - **lastConfirmedEntry**: The ledgerid:entryid of the last message successfully written. If the entryid is -1, then the ledger has been opened or is currently being opened but has no entries written yet.
+
+  - **state**: The state of this ledger for writing. LedgerOpened means we have a ledger open for saving published messages.
+
+  - **ledgers**: The ordered list of all ledgers for this topic holding its messages
+
+  - **cursors**: The list of all cursors on this topic. There will be one for every subscription you saw in the topic stats.
+
+    - **markDeletePosition**: The ack position: the last message the subscriber acknowledged receiving
+
+    - **readPosition**: The latest position of the subscriber for reading messages
+
+    - **waitingReadOp**: This is true when the subscription has read the latest message published to the topic and is waiting on new messages to be published.
+
+    - **pendingReadOps**: The counter of outstanding read requests to the BookKeepers in progress
+
+    - **messagesConsumedCounter**: Number of messages this cursor has acked since this broker loaded this topic
+
+    - **cursorLedger**: The ledger being used to persistently store the current markDeletePosition
+
+    - **cursorLedgerLastEntry**: The last entryid used to persistently store the current markDeletePosition
+
+    - **individuallyDeletedMessages**: If acks are being done out of order, shows the ranges of messages acked between the markDeletePosition and the read position
+
+    - **lastLedgerSwitchTimestamp**: The last time the cursor ledger was rolled over
+
+    - **state**: The state of the cursor ledger: Open means we have a cursor ledger for saving updates of the markDeletePosition.
+
+```json
+{
+  "entriesAddedCounter": 20449518,
+  "numberOfEntries": 3233,
+  "totalSize": 331482,
+  "currentLedgerEntries": 3233,
+  "currentLedgerSize": 331482,
+  "lastLedgerCreatedTimestamp": "2016-06-29 03:00:23.825",
+  "lastLedgerCreationFailureTimestamp": null,
+  "waitingCursorsCount": 1,
+  "pendingAddEntriesCount": 0,
+  "lastConfirmedEntry": "324711539:3232",
+  "state": "LedgerOpened",
+  "ledgers": [
+    {
+      "ledgerId": 324711539,
+      "entries": 0,
+      "size": 0
+    }
+  ],
+  "cursors": {
+    "my-subscription": {
+      "markDeletePosition": "324711539:3133",
+      "readPosition": "324711539:3233",
+      "waitingReadOp": true,
+      "pendingReadOps": 0,
+      "messagesConsumedCounter": 20449501,
+      "cursorLedger": 324702104,
+      "cursorLedgerLastEntry": 21,
+      "individuallyDeletedMessages": "[(324711539:3134‥324711539:3136], (324711539:3137‥324711539:3140], ]",
+      "lastLedgerSwitchTimestamp": "2016-06-29 01:30:19.313",
+      "state": "Open"
+    }
+  }
+}
+```
+
+#### pulsar-admin
+
+Topic internal-stats can be fetched using the [`stats-internal`](../../reference/CliTools#stats-internal) command.
+
+```shell
+$ pulsar-admin persistent stats-internal \
+  persistent://test-tenant/ns1/tp1
+```
+
+#### REST API
+
+{% endpoint GET /admin/persistent/:tenant/:namespace/:destination/internalStats %}
+
+[More info](../../reference/RestApi#/admin/persistent/:tenant/:namespace/:destination/internalStats)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+admin.persistentTopics().getInternalStats(destination);
+```
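+
+Because the internal stats are plain data objects (`org.apache.pulsar.common.policies.data.PersistentTopicInternalStats`) on the Java side, it can be handy to walk the cursor map programmatically, for example to spot subscriptions whose mark-delete position lags behind the read position. A small sketch in the same style as the examples above (assuming a configured `PulsarAdmin` instance named `admin`; the field names mirror the JSON shown earlier):
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+PersistentTopicInternalStats internalStats = admin.persistentTopics().getInternalStats(destination);
+
+// One cursor per subscription: compare the read position against the mark-delete (ack) position
+internalStats.cursors.forEach((subscription, cursor) -> {
+    System.out.printf("%s: markDelete=%s read=%s%n",
+        subscription, cursor.markDeletePosition, cursor.readPosition);
+});
+```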
+
+### Peek messages
+
+It peeks N messages for a specific subscription of a given topic.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin persistent peek-messages \
+  --count 10 --subscription my-subscription \
+  persistent://test-tenant/ns1/tp1
+
+Message ID: 315674752:0
+Properties: { "X-Pulsar-publish-time" : "2015-07-13 17:40:28.451" }
+msg-payload
+```
+
+#### REST API
+
+{% endpoint GET /admin/persistent/:tenant/:namespace/:destination/subscription/:subName/position/:messagePosition %}
+
+[More info](../../reference/RestApi#/admin/persistent/:tenant/:namespace/:destination/subscription/:subName/position/:messagePosition)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+String subName = "my-subscription";
+int numMessages = 1;
+admin.persistentTopics().peekMessages(destination, subName, numMessages);
+```
+
+### Skip messages
+
+It skips N messages for a specific subscription of a given topic.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin persistent skip \
+  --count 10 --subscription my-subscription \
+  persistent://test-tenant/ns1/tp1
+```
+
+#### REST API
+
+{% endpoint POST /admin/persistent/:tenant/:namespace/:destination/subscription/:subName/skip/:numMessages %}
+
+[More info](../../reference/RestApi#/admin/persistent/:tenant/:namespace/:destination/subscription/:subName/skip/:numMessages)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+String subName = "my-subscription";
+int numMessages = 1;
+admin.persistentTopics().skipMessages(destination, subName, numMessages);
+```
+
+### Skip all messages
+
+It skips all old messages for a specific subscription of a given topic.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin persistent skip-all \
+  --subscription my-subscription \
+  persistent://test-tenant/ns1/tp1
+```
+
+#### REST API
+
+{% endpoint POST /admin/persistent/:tenant/:namespace/:destination/subscription/:subName/skip_all %}
+
+[More info](../../reference/RestApi#/admin/persistent/:tenant/:namespace/:destination/subscription/:subName/skip_all)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+String subName = "my-subscription";
+admin.persistentTopics().skipAllMessages(destination, subName);
+```
+
+### Reset cursor
+
+It resets a subscription's cursor to the position it held a given number of minutes earlier. In effect, it calculates the cursor's time and position as of X minutes ago and resets the cursor there.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin persistent reset-cursor \
+  --subscription my-subscription --time 10 \
+  persistent://test-tenant/ns1/tp1
+```
+
+#### REST API
+
+{% endpoint POST /admin/persistent/:tenant/:namespace/:destination/subscription/:subName/resetcursor/:timestamp %}
+
+[More info](../../reference/RestApi#/admin/persistent/:tenant/:namespace/:destination/subscription/:subName/resetcursor/:timestamp)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+String subName = "my-subscription";
+long timestamp = 2342343L;
+admin.persistentTopics().resetCursor(destination, subName, timestamp);
+```
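+
+Note that the Java variant takes an absolute timestamp (milliseconds since the epoch) rather than a relative number of minutes, so the equivalent of `--time 10` has to be computed by the caller. A sketch (assuming a configured `PulsarAdmin` instance named `admin`):
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+String subName = "my-subscription";
+
+// Equivalent of `--time 10`: reset the cursor to where it was 10 minutes ago
+long tenMinutesAgo = System.currentTimeMillis() - TimeUnit.MINUTES.toMillis(10);
+admin.persistentTopics().resetCursor(destination, subName, tenMinutesAgo);
+```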
+
+### Lookup of topic
+
+It locates the broker URL which is serving the given topic.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin persistent lookup \
+  persistent://test-tenant/ns1/tp1
+
+ "pulsar://broker1.org.com:4480"
+```
+
+#### REST API
+
+{% endpoint GET /lookup/v2/destination/persistent/:tenant/:namespace/:destination %}
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+admin.lookup().lookupDestination(destination);
+```
+
+### Get bundle
+
+It gives the range of the bundle which contains the given topic.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin persistent bundle-range \
+  persistent://test-tenant/ns1/tp1
+
+ "0x00000000_0xffffffff"
+```
+
+#### REST API
+
+{% endpoint GET /lookup/v2/destination/:destination_domain/:tenant/:namespace/:destination/bundle %}
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+admin.lookup().getBundleRange(destination);
+```
+
+
+### Get subscriptions
+
+It shows all subscription names for a given topic.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin persistent subscriptions \
+  persistent://test-tenant/ns1/tp1
+
+ my-subscription
+```
+
+#### REST API
+
+{% endpoint GET /admin/persistent/:tenant/:namespace/:destination/subscriptions %}
+
+[More info](../../reference/RestApi#/admin/persistent/:tenant/:namespace/:destination/subscriptions)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+admin.persistentTopics().getSubscriptions(destination);
+```
+
+### Unsubscribe
+
+It unsubscribes a subscription that is no longer processing messages.
+
+#### pulsar-admin
+
+```shell
+$ pulsar-admin persistent unsubscribe \
+  --subscription my-subscription \
+  persistent://test-tenant/ns1/tp1
+```
+
+#### REST API
+
+{% endpoint POST /admin/namespaces/:tenant/:namespace/unsubscribe/:subscription %}
+
+[More info](../../reference/RestApi#/admin/namespaces/:tenant/:namespace/unsubscribe/:subscription)
+
+#### Java
+
+```java
+String destination = "persistent://my-tenant/my-namespace/my-topic";
+String subscriptionName = "my-subscription";
+admin.persistentTopics().deleteSubscription(destination, subscriptionName);
+```
diff --git a/site2/docs/admin-api-tenants.md b/site2/docs/admin-api-tenants.md
new file mode 100644
index 0000000000000000000000000000000000000000..da2e9195c3f6cecb729094404332aa556b96b6f2
--- /dev/null
+++ b/site2/docs/admin-api-tenants.md
@@ -0,0 +1,85 @@
+---
+id: admin-api-tenants
+title: Managing Tenants
+sidebar_label: Tenants
+---
+
+Tenants, like namespaces, can be managed using the [admin API](admin-api-overview.md). There are currently two configurable aspects of tenants:
+
+* Admin roles
+* Allowed clusters
+
+## Tenant resources
+
+### List
+
+#### pulsar-admin
+
+You can list all of the tenants associated with an {% popover instance %} using the [`list`](reference-pulsar-admin.md#tenants-list) subcommand:
+
+```shell
+$ pulsar-admin tenants list
+```
+
+That will return a simple list, like this:
+
+```
+my-tenant-1
+my-tenant-2
+```
+
+### Create
+
+#### pulsar-admin
+
+You can create a new tenant using the [`create`](reference-pulsar-admin.md#tenants-create) subcommand:
+
+```shell
+$ pulsar-admin tenants create my-tenant
+```
+
+When creating a tenant, you can assign admin roles using the `-r`/`--admin-roles` flag. You can specify multiple roles as a comma-separated list.
Here are some examples:
+
+```shell
+$ pulsar-admin tenants create my-tenant \
+  --admin-roles role1,role2,role3
+
+$ pulsar-admin tenants create my-tenant \
+  -r role1
+```
+
+### Get configuration
+
+#### pulsar-admin
+
+You can see a tenant's configuration as a JSON object using the [`get`](reference-pulsar-admin.md#tenants-get) subcommand and specifying the name of the tenant:
+
+```shell
+$ pulsar-admin tenants get my-tenant
+{
+  "adminRoles": [
+    "admin1",
+    "admin2"
+  ],
+  "allowedClusters": [
+    "cl1",
+    "cl2"
+  ]
+}
+```
+
+### Delete
+
+#### pulsar-admin
+
+You can delete a tenant using the [`delete`](reference-pulsar-admin.md#tenants-delete) subcommand and specifying the tenant name:
+
+```shell
+$ pulsar-admin tenants delete my-tenant
+```
+
+### Updating
+
+#### pulsar-admin
+
+You can update a tenant's configuration using the [`update`](reference-pulsar-admin.md#tenants-update) subcommand.
diff --git a/site2/docs/administration-auth.md b/site2/docs/administration-auth.md
new file mode 100644
index 0000000000000000000000000000000000000000..392ce48060cfb36c044e03bbd5983621e88afb06
--- /dev/null
+++ b/site2/docs/administration-auth.md
@@ -0,0 +1,434 @@
+---
+id: administration-auth
+title: Authentication and authorization in Pulsar
+sidebar_label: Authentication and authorization
+---
+
+Pulsar supports a pluggable authentication mechanism that Pulsar clients can use to authenticate with {% popover brokers %}. Pulsar can also be configured to support multiple authentication sources.
+
+## Role tokens
+
+In Pulsar, a *role* is a string, like `admin` or `app1`, that can represent a single client or multiple clients. Roles are used to control permission for clients to produce or consume from certain topics, administer the configuration for properties, and more.
+
+The purpose of the [authentication provider](#authentication-providers) is to establish the identity of the client and then assign that client a *role token*. This role token is then used to determine what the client is authorized to do.
+
+## Authentication providers
+
+Out of the box, Pulsar supports two authentication providers:
+
+* [TLS client auth](#tls-client-auth)
+* [Athenz](#athenz)
+
+### TLS client auth
+
+In addition to providing connection encryption between Pulsar clients and {% popover brokers %}, [Transport Layer Security](https://en.wikipedia.org/wiki/Transport_Layer_Security) (TLS) can be used to identify clients through a certificate signed by a trusted certificate authority.
+
+#### Creating certificates
+
+Creating TLS certificates for Pulsar involves creating a [certificate authority](#certificate-authority) (CA), a [broker certificate](#broker-certificate), and a [client certificate](#client-certificate).
+
+##### Certificate authority
+
+The first step is to create the certificate for the CA. The CA will be used to sign both the broker and client certificates, in order to ensure that each party will trust the others.
+
+###### Linux
+
+```bash
+$ CA.pl -newca
+```
+
+###### macOS
+
+```bash
+$ /System/Library/OpenSSL/misc/CA.pl -newca
+```
+
+After answering the question prompts, this will store CA-related files in the `./demoCA` directory. Within that directory:
+
+* `demoCA/cacert.pem` is the public certificate. It is meant to be distributed to all parties involved.
+* `demoCA/private/cakey.pem` is the private key. This is only needed when signing a new certificate for either a broker or a client, and it must be safely guarded.
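+
+To sanity-check the generated CA certificate before distributing it, you can parse it with standard JDK classes and print the fields you would otherwise inspect with `openssl x509 -text`. A minimal sketch (assuming it is run from the directory containing `demoCA`):
+
+```java
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.security.cert.CertificateFactory;
+import java.security.cert.X509Certificate;
+
+public class InspectCaCert {
+    public static void main(String[] args) throws Exception {
+        try (InputStream in = new FileInputStream("demoCA/cacert.pem")) {
+            // Parse the PEM-encoded certificate and print the fields worth verifying
+            CertificateFactory factory = CertificateFactory.getInstance("X.509");
+            X509Certificate cert = (X509Certificate) factory.generateCertificate(in);
+            System.out.println("Subject:   " + cert.getSubjectX500Principal());
+            System.out.println("Issuer:    " + cert.getIssuerX500Principal());
+            System.out.println("Not after: " + cert.getNotAfter());
+        }
+    }
+}
+```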
+
+##### Broker certificate
+
+Once a CA certificate has been created, you can create certificate requests and sign them with the CA.
+
+The following commands will ask you a few questions and then create the certificates. When asked for the common name, you need to match the hostname of the broker. You could also use a wildcard to match a group of broker hostnames, for example `*.broker.usw.example.com`. This ensures that the same certificate can be reused on multiple machines.
+
+```shell
+$ openssl req \
+  -newkey rsa:2048 \
+  -sha256 \
+  -nodes \
+  -out broker-cert.csr \
+  -outform PEM
+```
+
+Convert the key to [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format:
+
+```shell
+$ openssl pkcs8 \
+  -topk8 \
+  -inform PEM \
+  -outform PEM \
+  -in privkey.pem \
+  -out broker-key.pem \
+  -nocrypt
+```
+
+This will create a certificate request `broker-cert.csr` and a private key `broker-key.pem`. Now you can create the signed certificate:
+
+```shell
+$ openssl ca \
+  -out broker-cert.pem \
+  -infiles broker-cert.csr
+```
+
+At this point, you should have a `broker-cert.pem` and a `broker-key.pem` file. These will be needed for the broker.
+
+##### Client certificate
+
+To create a client certificate, repeat the steps in the previous section, but create `client-cert.pem` and `client-key.pem` files instead.
+
+For the client common name, you need to use a string that you intend to use as the [role token](#role-tokens) for this client, though it doesn't need to match the client hostname.
+
+#### Configure the broker for TLS
+
+To configure a Pulsar broker to use TLS authentication, you'll need to make some changes to the `broker.conf` configuration file, which is located in the `conf` directory of your [Pulsar installation](getting-started-standalone.md).
+
+Add these values to the configuration file (substituting the appropriate certificate paths where necessary):
+
+```properties
+# Enable TLS and point the broker to the right certs
+tlsEnabled=true
+tlsCertificateFilePath=/path/to/broker-cert.pem
+tlsKeyFilePath=/path/to/broker-key.pem
+tlsTrustCertsFilePath=/path/to/cacert.pem
+
+# Enable the TLS auth provider
+authenticationEnabled=true
+authorizationEnabled=true
+authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls
+```
+
+> A full listing of parameters available in the `conf/broker.conf` file, as well as the default values for those parameters, can be found in [Broker Configuration](reference-configuration.md#broker).
+
+
+#### Configure the discovery service
+
+The discovery service used by Pulsar brokers needs to redirect all HTTPS requests, which means that it needs to be trusted by the client as well. Add this configuration in `conf/discovery.conf` in your Pulsar installation:
+
+```properties
+tlsEnabled=true
+tlsCertificateFilePath=/path/to/broker-cert.pem
+tlsKeyFilePath=/path/to/broker-key.pem
+```
+
+#### Configure clients
+
+For more information on Pulsar client authentication using TLS, see the following language-specific docs:
+
+* [Java client](client-libraries-java.md)
+* [C++ client](client-libraries-cpp.md)
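+
+For orientation, here is a minimal Java sketch of TLS authentication (the full details live in the language-specific docs above; the certificate paths and the `pulsar+ssl` URL on port 6651 are assumptions matching the setup in this section):
+
+```java
+PulsarClient client = PulsarClient.builder()
+    .serviceUrl("pulsar+ssl://broker.example.com:6651")
+    // Trust the CA that signed the broker certificate
+    .tlsTrustCertsFilePath("/path/to/cacert.pem")
+    // Present the client certificate; its common name becomes the role token
+    .authentication("org.apache.pulsar.client.impl.auth.AuthenticationTls",
+        "tlsCertFile:/path/to/client-cert.pem,tlsKeyFile:/path/to/client-key.pem")
+    .build();
+```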
+
+#### Configure CLI tools
+
+[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation.
+
+You'll need to add the following authentication parameters to that file to use TLS with Pulsar's CLI tools:
+
+```properties
+serviceUrl=https://broker.example.com:8443/
+authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls
+authParams=tlsCertFile:/path/to/client-cert.pem,tlsKeyFile:/path/to/client-key.pem
+useTls=true
+tlsAllowInsecureConnection=false
+tlsTrustCertsFilePath=/path/to/cacert.pem
+```
+
+### Athenz
+
+[Athenz](https://github.com/yahoo/athenz) is a role-based authentication/authorization system. In Pulsar, Athenz [role tokens](#role-tokens) (aka *z-tokens*) can be used to establish the identity of the client.
+
+#### Athenz authentication settings
+
+In a [decentralized Athenz system](https://github.com/yahoo/athenz/blob/master/docs/dev_decentralized_access.md) there is both an [authori**Z**ation **M**anagement **S**ystem](https://github.com/yahoo/athenz/blob/master/docs/setup_zms.md) (ZMS) server and an [authori**Z**ation **T**oken **S**ystem](https://github.com/yahoo/athenz/blob/master/docs/setup_zts.md) (ZTS) server.
+
+To begin, you need to set up Athenz service access control. You should create domains for the *provider* (which provides some resources to other services with some authentication/authorization policies) and the *tenant* (which is provisioned to access some resources in a provider). In this case, the provider corresponds to the Pulsar service itself and the tenant corresponds to each application using Pulsar (typically, a property in Pulsar).
+
+##### Create the tenant domain and service
+
+On the tenant side, you need to:
+
+1. Create a domain, such as `shopping`
+2. Generate a private/public key pair
+3. Create a service, such as `some_app`, on the domain with the public key
+
+Note that the private key generated in step 2 needs to be specified when the Pulsar client connects to the broker (see client configuration examples for [Java](client-libraries-java.md#tls-authentication) and [C++](client-libraries-cpp.md#tls-authentication)).
+
+For more specific steps involving the Athenz UI, please refer to [this doc](https://github.com/yahoo/athenz/blob/master/docs/example_service_athenz_setup.md#client-tenant-domain).
+
+##### Create the provider domain and add the tenant service to some role members
+
+On the provider side, you need to:
+
+1. Create a domain, such as `pulsar`
+2. Create a role
+3. Add the tenant service to members of the role
+
+Note that in step 2 any action and resource can be specified since they are not used on Pulsar. In other words, Pulsar uses the Athenz role token only for authentication, *not* for authorization.
+
+For more specific steps involving the UI, please refer to [this doc](https://github.com/yahoo/athenz/blob/master/docs/example_service_athenz_setup.md#server-provider-domain).
+
+#### Configure the broker for Athenz
+
+{% include message.html id="tls_role_tokens" %}
+
+> #### TLS encryption strongly recommended
+> Please note that using TLS encryption is strongly recommended when using Athenz as an authentication provider, as it can protect role tokens from being intercepted and reused (see the [TLS client auth](#tls-client-auth) section above).
+
+
+In the `conf/broker.conf` configuration file in your Pulsar installation, you need to provide the class name of the Athenz authentication provider as well as a comma-separated list of provider domain names.
+
+```properties
+# Add the Athenz auth provider
+authenticationEnabled=true
+authorizationEnabled=true
+authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderAthenz
+athenzDomainNames=pulsar
+
+# Enable TLS
+tlsEnabled=true
+tlsCertificateFilePath=/path/to/broker-cert.pem
+tlsKeyFilePath=/path/to/broker-key.pem
+```
+
+> A full listing of parameters available in the `conf/broker.conf` file, as well as the default values for those parameters, can be found in [Broker Configuration](reference-configuration.md#broker).
+
+#### Configure clients for Athenz
+
+For more information on Pulsar client authentication using Athenz, see the following language-specific docs:
+
+* [Java client](client-libraries-java.md#athenz)
+
+#### Configure CLI tools for Athenz
+
+[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation.
+
+You'll need to add the following authentication parameters to that file to use Athenz with Pulsar's CLI tools:
+
+```properties
+# URL for the broker
+serviceUrl=https://broker.example.com:8443/
+
+# Set Athenz auth plugin and its parameters
+authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationAthenz
+authParams={"tenantDomain":"shopping","tenantService":"some_app","providerDomain":"pulsar","privateKey":"file:///path/to/private.pem","keyId":"v1"}
+
+# Enable TLS
+useTls=true
+tlsAllowInsecureConnection=false
+tlsTrustCertsFilePath=/path/to/cacert.pem
+```
+
+## Authorization
+
+In Pulsar, the [authentication provider](#authentication-providers) is charged with properly identifying clients and associating them with [role tokens](#role-tokens). *Authorization* is the process that determines *what* clients are able to do.
+
+Authorization in Pulsar is managed at the tenant level, which means that you can have multiple authorization schemes active in a single Pulsar instance. You could, for example, create a `shopping` tenant that has one set of [roles](#role-tokens) and applies to a shopping application used by your company, while an `inventory` tenant would be used only by an inventory application.
+
+> When working with properties, you can specify which of your Pulsar clusters your tenant is allowed to use. This enables you to also have cluster-level authorization schemes.
+
+## Creating a new tenant
+
+A Pulsar tenant is typically provisioned by Pulsar {% popover instance %} administrators or by some kind of self-service portal.
+
+Tenants are managed using the [`pulsar-admin`](reference-pulsar-admin.md) tool. Here's an example tenant creation command:
+
+```shell
+$ bin/pulsar-admin tenants create my-tenant \
+  --admin-roles my-admin-role \
+  --allowed-clusters us-west,us-east
+```
+
+This command will create a new tenant `my-tenant` that will be allowed to use the clusters `us-west` and `us-east`.
+
+A client that successfully identified itself as having the role `my-admin-role` would then be allowed to perform all administrative tasks on this tenant.
+
+The structure of topic names in Pulsar reflects the hierarchy between tenants, clusters, and [namespaces](#managing-namespaces):
+
+```http
+persistent://tenant/namespace/topic
+```
+
+## Managing permissions
+
+Permissions in Pulsar are managed at the namespace level (that is, within tenants and clusters).
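+
+Before going through each operation one by one, here is a condensed sketch of the typical flow with the Java admin client (assuming a configured `PulsarAdmin` instance named `admin`; the individual operations are covered in the sections below):
+
+```java
+String namespace = "test-tenant/ns1";
+String role = "admin10";
+
+// Grant produce/consume to the role, then read the namespace's permission map back
+Set<AuthAction> actions = Sets.newHashSet(AuthAction.produce, AuthAction.consume);
+admin.namespaces().grantPermissionOnNamespace(namespace, role, actions);
+
+Map<String, Set<AuthAction>> permissions = admin.namespaces().getPermissions(namespace);
+System.out.println(role + " -> " + permissions.get(role));
+```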
+
+### Grant permissions
+
+You can grant permissions to specific roles for lists of operations such as `produce` and `consume`.
+
+#### pulsar-admin
+
+Use the [`grant-permission`](reference-pulsar-admin.md#namespaces-grant-permission) subcommand and specify a namespace, actions using the `--actions` flag, and a role using the `--role` flag:
+
+```shell
+$ pulsar-admin namespaces grant-permission test-tenant/ns1 \
+  --actions produce,consume \
+  --role admin10
+```
+
+Wildcard authorization can be performed when `authorizationAllowWildcardsMatching` is set to `true` in `broker.conf`.
+
+For example:
+
+```shell
+$ pulsar-admin namespaces grant-permission test-tenant/ns1 \
+  --actions produce,consume \
+  --role 'my.role.*'
+```
+
+Then, roles `my.role.1`, `my.role.2`, `my.role.foo`, `my.role.bar`, etc. can produce and consume.
+
+```shell
+$ pulsar-admin namespaces grant-permission test-tenant/ns1 \
+  --actions produce,consume \
+  --role '*.role.my'
+```
+
+Then, roles `1.role.my`, `2.role.my`, `foo.role.my`, `bar.role.my`, etc. can produce and consume.
+
+**Note**: Wildcard matching works at **the beginning or end of the role name only**. For example:
+
+```shell
+$ pulsar-admin namespaces grant-permission test-tenant/ns1 \
+  --actions produce,consume \
+  --role 'my.*.role'
+```
+
+In this case, only the literal role `my.*.role` has permissions.
+Roles `my.1.role`, `my.2.role`, `my.foo.role`, `my.bar.role`, etc. **cannot** produce and consume.
+
+#### REST API
+
+```http
+POST /admin/v2/namespaces/:tenant/:namespace/permissions/:role
+```
+
+[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/permissions/:role)
+
+#### Java
+
+```java
+admin.namespaces().grantPermissionOnNamespace(namespace, role, getAuthActions(actions));
+```
+
+### Get permission
+
+You can see which permissions have been granted to which roles in a namespace.
+
+#### pulsar-admin
+
+Use the [`permissions`](reference-pulsar-admin.md#namespaces-permissions) subcommand and specify a namespace:
+
+```shell
+$ pulsar-admin namespaces permissions test-tenant/ns1
+{
+  "admin10": [
+    "produce",
+    "consume"
+  ]
+}
+```
+
+#### REST API
+
+```http
+GET /admin/v2/namespaces/:tenant/:namespace/permissions
+```
+
+[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/permissions)
+
+#### Java
+
+```java
+admin.namespaces().getPermissions(namespace);
+```
+
+### Revoke permissions
+
+You can revoke permissions from specific roles, which means that those roles will no longer have access to the specified namespace.
+
+#### pulsar-admin
+
+Use the [`revoke-permission`](reference-pulsar-admin.md#revoke-permission) subcommand and specify a namespace and a role using the `--role` flag:
+
+```shell
+$ pulsar-admin namespaces revoke-permission test-tenant/ns1 \
+  --role admin10
+```
+
+#### REST API
+
+```http
+DELETE /admin/v2/namespaces/:tenant/:namespace/permissions/:role
+```
+
+[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/permissions/:role)
+
+#### Java
+
+```java
+admin.namespaces().revokePermissionsOnNamespace(namespace, role);
+```
+
+
+## Superusers
+
+In Pulsar you can assign certain roles to be *superusers* of the system. A superuser is allowed to perform all administrative tasks on all properties and namespaces, as well as to publish and subscribe to all topics.
+
+Superusers are configured in the [`conf/broker.conf`](reference-configuration.md#broker) configuration file, using the [`superUserRoles`](reference-configuration.md#broker-superUserRoles) parameter:
+
+```properties
+superUserRoles=my-super-user-1,my-super-user-2
+```
+
+> A full listing of parameters available in the `conf/broker.conf` file, as well as the default values for those parameters, can be found in [Broker Configuration](reference-configuration.md#broker).
+
+Typically, superuser roles are used for administrators and clients but also for broker-to-broker authorization. When using [geo-replication](administration-geo.md), every broker
+needs to be able to publish to other clusters' topics.
+
+## Pulsar admin authentication
+
+```java
+String authPluginClassName = "com.org.MyAuthPluginClass";
+String authParams = "param1:value1";
+boolean useTls = false;
+boolean tlsAllowInsecureConnection = false;
+String tlsTrustCertsFilePath = null;
+
+ClientConfiguration config = new ClientConfiguration();
+config.setAuthentication(authPluginClassName, authParams);
+config.setUseTls(useTls);
+config.setTlsAllowInsecureConnection(tlsAllowInsecureConnection);
+config.setTlsTrustCertsFilePath(tlsTrustCertsFilePath);
+
+PulsarAdmin admin = new PulsarAdmin(url, config);
+```
+
+To use TLS:
+
+```java
+String authPluginClassName = "com.org.MyAuthPluginClass";
+String authParams = "param1:value1";
+// Enable TLS and point the client at the CA certificate
+boolean useTls = true;
+boolean tlsAllowInsecureConnection = false;
+String tlsTrustCertsFilePath = "/path/to/cacert.pem";
+
+ClientConfiguration config = new ClientConfiguration();
+config.setAuthentication(authPluginClassName, authParams);
+config.setUseTls(useTls);
+config.setTlsAllowInsecureConnection(tlsAllowInsecureConnection);
+config.setTlsTrustCertsFilePath(tlsTrustCertsFilePath);
+
+PulsarAdmin admin = new PulsarAdmin(url, config);
+```
diff --git a/site2/docs/administration-dashboard.md b/site2/docs/administration-dashboard.md
new file mode 100644
index 0000000000000000000000000000000000000000..e4e3625f0d1d641c0be29c715fd7a32096fd4a78
--- /dev/null
+++ b/site2/docs/administration-dashboard.md
@@ -0,0 +1,38 @@
+---
+id: administration-dashboard
+title: The Pulsar dashboard
+sidebar_label: Dashboard
+---
+
+The Pulsar dashboard is a web application that enables users to monitor current stats for all {% popover topics %} in tabular form.
+
+The dashboard is a data collector that polls stats from all the brokers in a Pulsar instance (across multiple clusters) and stores all the information in a [PostgreSQL](https://www.postgresql.org/) database.
+
+A [Django](https://www.djangoproject.com) web app is used to render the collected data.
+
+## Install
+
+The easiest way to use the dashboard is to run it inside a [Docker](https://www.docker.com/products/docker) container. A [`Dockerfile`](pulsar:repo_url/dashboard/Dockerfile) to generate the image is provided.
+
+To generate the Docker image:
+
+```shell
+$ docker build -t pulsar-dashboard dashboard
+```
+
+To run the dashboard:
+
+```shell
+$ SERVICE_URL=http://broker.example.com:8080/
+$ docker run -p 80:80 \
+  -e SERVICE_URL=$SERVICE_URL \
+  apachepulsar/pulsar-dashboard
+```
+
+You need to specify only one service URL for a Pulsar cluster. Internally, the collector will figure out all the existing clusters and the brokers from which it needs to pull the metrics. If you're connecting the dashboard to Pulsar running in standalone mode, the URL will be `http://localhost:8080` by default.
+ +Once the Docker container is running, the web dashboard will be accessible via `localhost` or whichever host is being used by Docker. + +### Known issues + +Pulsar [authentication](administration-auth.md#authentication-providers) is not supported at this point. The dashboard's data collector does not pass any authentication-related data and will be denied access if the Pulsar broker requires authentication. diff --git a/site2/docs/administration-geo.md b/site2/docs/administration-geo.md new file mode 100644 index 0000000000000000000000000000000000000000..08c7b3ee29afaeb1da167d2a517c26cd1affd263 --- /dev/null +++ b/site2/docs/administration-geo.md @@ -0,0 +1,124 @@ +--- +id: administration-geo +title: Pulsar geo-replication +sidebar_label: Geo-replication +--- + +*Geo-replication* is the replication of persistently stored message data across multiple clusters of a Pulsar instance. + +## How it works + +The diagram below illustrates the process of geo-replication across Pulsar clusters: + +![Replication Diagram](/docs/assets/geo-replication.png) + +In this diagram, whenever producers **P1**, **P2**, and **P3** publish messages to the topic **T1** on clusters **Cluster-A**, **Cluster-B**, and **Cluster-C**, respectively, those messages are instantly replicated across clusters. Once replicated, consumers **C1** and **C2** can consume those messages from their respective clusters. + +Without geo-replication, consumers **C1** and **C2** wouldn't be able to consume messages published by producer **P3**. + +## Geo-replication and Pulsar properties + +Geo-replication must be enabled on a per-tenant basis in Pulsar. Geo-replication can be enabled between clusters only when a property has been created that allows access to both clusters. + +Although geo-replication must be enabled between two clusters, it's actually managed at the namespace level. You must do the following to enable geo-replication for a namespace: + +* [Create a global namespace](#creating-global-namespaces) +* Configure that namespace to replicate between two or more provisioned clusters + +Any message published on *any* topic in that namespace will then be replicated to all clusters in the specified set. + +## Local persistence and forwarding + +When messages are produced on a Pulsar topic, they are first persisted in the local cluster and then forwarded asynchronously to the remote clusters. + +In normal cases, when there are no connectivity issues, messages are replicated immediately, at the same time as they are dispatched to local consumers. Typically, end-to-end delivery latency is defined by the network [round-trip time](https://en.wikipedia.org/wiki/Round-trip_delay_time) (RTT) between the remote regions. + +Applications can create producers and consumers in any of the clusters, even when the remote clusters are not reachable (like during a network partition). + +> #### Subscriptions are local to a cluster +> While producers and consumers can publish to and consume from any cluster in a Pulsar instance, subscriptions are local to the clusters in which they are created and cannot be transferred between clusters. If you do need to transfer a subscription, you’ll need to create a new subscription in the desired cluster. + +In the example in the image above, the topic **T1** is being replicated between 3 clusters, **Cluster-A**, **Cluster-B**, and **Cluster-C**. + +All messages produced in any cluster will be delivered to all subscriptions in all the other clusters. 
In this case, consumers **C1** and **C2** will receive all messages published by producers **P1**, **P2**, and **P3**. Ordering is still guaranteed on a per-producer basis.
+
+## Configuring replication
+
+As stated [above](#geo-replication-and-pulsar-properties), geo-replication in Pulsar is managed at the {% popover property %} level.
+
+### Granting permissions to properties
+
+To establish replication to a cluster, the tenant needs permission to use that cluster. This permission can be granted when the property is created or later on.
+
+At creation time, specify all the intended clusters:
+
+```shell
+$ bin/pulsar-admin properties create my-property \
+  --admin-roles my-admin-role \
+  --allowed-clusters us-west,us-east,us-cent
+```
+
+To update permissions of an existing property, use `update` instead of `create`.
+
+### Creating global namespaces
+
+Replication must be used with *global* topics, meaning topics that belong to a global namespace and are thus not tied to any particular cluster.
+
+Global namespaces need to be created in the `global` virtual cluster. For example:
+
+```shell
+$ bin/pulsar-admin namespaces create my-tenant/my-namespace
+```
+
+Initially, the namespace is not assigned to any cluster. You can assign the namespace to clusters using the `set-clusters` subcommand:
+
+```shell
+$ bin/pulsar-admin namespaces set-clusters my-tenant/my-namespace \
+  --clusters us-west,us-east,us-cent
+```
+
+The replication clusters for a namespace can be changed at any time, with no disruption to ongoing traffic. Replication channels will be immediately set up or stopped in all the clusters as soon as the configuration changes.
+
+### Using global topics
+
+Once you've created a global namespace, any topics that producers or consumers create within that namespace will be global. Typically, each application will use the `serviceUrl` for the local cluster.
+
+#### Selective replication
+
+By default, messages are replicated to all clusters configured for the namespace. You can restrict replication selectively by specifying a replication list for a message. That message will then be replicated only to the subset in the replication list.
+
+Below is an example for the [Java API](client-libraries-java.md). Note the use of the `replicationClusters` method when constructing the {% javadoc Message client org.apache.pulsar.client.api.Message %} object:
+
+```java
+List<String> restrictReplicationTo = Arrays.asList(
+  "us-west",
+  "us-east"
+);
+
+Producer<byte[]> producer = client.newProducer()
+    .topic("some-topic")
+    .create();
+
+producer.newMessage()
+    .value("my-payload".getBytes())
+    .replicationClusters(restrictReplicationTo)
+    .send();
+```
+
+#### Topic stats
+
+Topic-specific statistics for global topics are available via the [`pulsar-admin`](reference-pulsar-admin.md) tool and [REST API](reference-rest-api.md):
+
+```shell
+$ bin/pulsar-admin persistent stats persistent://my-tenant/my-namespace/my-topic
+```
+
+Each cluster reports its own local stats, including incoming and outgoing replication rates and backlogs.
+
+#### Deleting a global topic
+
+Given that global topics exist in multiple regions, it's not possible to directly delete a global topic. Instead, you should rely on automatic topic garbage collection.
+
+In Pulsar, a topic is automatically deleted when it's no longer used, that is to say, when no producers or consumers are connected *and* there are no subscriptions. 
For global topics, each region will use a fault-tolerant mechanism to decide when it's safe to delete the topic locally.
+
+To delete a global topic, close all producers and consumers on the topic and delete all its local subscriptions in every replication cluster. When Pulsar determines that no valid subscription for the topic remains across the system, it will garbage collect the topic.
diff --git a/site2/docs/administration-load-distribution.md b/site2/docs/administration-load-distribution.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8a2dc96ace4c77e841a0f65227c0696c690f60f
--- /dev/null
+++ b/site2/docs/administration-load-distribution.md
@@ -0,0 +1,217 @@
+---
+id: administration-load-distribution
+title: Pulsar load distribution
+sidebar_label: Load distribution
+---
+
+## Load distribution across Pulsar brokers
+
+Pulsar is a horizontally scalable messaging system, so it is a core requirement that the traffic
+in a logical cluster be spread across all the available Pulsar brokers as evenly as possible.
+
+In most cases, this is true out of the box and one shouldn't worry about it. There are, though,
+multiple settings and tools to control the traffic distribution, and they require a bit of
+context to understand how the traffic is managed in Pulsar.
+
+## Pulsar load manager architecture
+
+### Dynamic assignment of topics to brokers
+
+Topics are dynamically assigned to brokers based on the load conditions of all brokers in the
+cluster.
+
+When a client starts using new topics that are not assigned to any broker, it will trigger a
+process that, given the current load conditions, chooses the best-suited broker to acquire
+ownership of those topics.
+
+In the case of partitioned topics, different partitions might be assigned to different brokers. We talk
+about "topic" in this context to mean either a non-partitioned topic or one partition of a topic.
+
+The assignment is "dynamic" because it can change very quickly. For example, if the broker owning
+the topic crashes, the topic will be reassigned immediately to another broker. Another scenario is
+that the broker owning the topic becomes overloaded. In this case too, the topic will be
+reassigned to a less loaded broker.
+
+The dynamic assignment is made possible by the stateless nature of brokers. This also ensures that
+we can quickly expand or shrink the cluster based on usage.
+
+### Assignment granularity
+
+The assignment of topics/partitions to brokers is not done at the individual level. The reason for
+this is to amortize the amount of information that we need to keep track of (e.g. which topics are
+assigned to a particular broker, what's the load on topics for a broker, and similar).
+
+Instead of individual topic/partition assignment, each broker takes ownership of a subset of the
+topics for a namespace. This subset is called a "*bundle*" and effectively it's a sharding
+mechanism.
+
+The namespace is the "administrative" unit: many config knobs or operations are done at the
+namespace level.
+
+For assignment, a namespace is sharded into a list of "bundles", with each bundle comprising
+a portion of the overall hash range of the namespace.
+
+Topics are assigned to a particular bundle by taking the hash of the topic name and checking in
+which bundle's range the hash falls (see the sketch below).
+
+Each bundle is independent of the others and thus is independently assigned to different brokers.
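+
+To make the hashing mechanism concrete, here is an illustrative sketch (not Pulsar's actual implementation, which lives in the broker's namespace-bundle machinery and uses its own hash function) of how a topic name could map to one of N bundles that evenly split a 32-bit hash range:
+
+```java
+import java.nio.charset.StandardCharsets;
+import java.util.zip.CRC32;
+
+public class BundleSketch {
+    // Illustrative only: map a topic name to one of `numBundles` equal slices
+    // of the 32-bit hash space [0x00000000, 0xffffffff]
+    static int bundleFor(String topicName, int numBundles) {
+        CRC32 crc = new CRC32();
+        crc.update(topicName.getBytes(StandardCharsets.UTF_8));
+        long hash = crc.getValue();                // value in [0, 2^32 - 1]
+        long sliceWidth = (1L << 32) / numBundles; // width of each bundle's slice
+        return (int) Math.min(hash / sliceWidth, numBundles - 1);
+    }
+
+    public static void main(String[] args) {
+        for (int i = 0; i < 4; i++) {
+            String topic = "persistent://my-tenant/my-namespace/topic-" + i;
+            System.out.println(topic + " -> bundle " + bundleFor(topic, 16));
+        }
+    }
+}
+```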
+
+### Creating namespaces and bundles
+
+When creating a new namespace, it will be set to use the default number of bundles. This is set in
+`conf/broker.conf`:
+
+```properties
+# When a namespace is created without specifying the number of bundles, this
+# value will be used as the default
+defaultNumberOfNamespaceBundles=4
+```
+
+One can either change the system default, or override it when creating a new namespace:
+
+```shell
+$ bin/pulsar-admin namespaces create my-tenant/my-namespace --clusters us-west --bundles 16
+```
+
+With this command, we're creating a namespace with 16 initial bundles. Therefore the topics for
+this namespace can immediately be spread across up to 16 brokers.
+
+In general, if the expected traffic and number of topics is known in advance, it's a good idea to
+start with a reasonable number of bundles instead of waiting for the system to auto-correct the
+distribution.
+
+On the same note, it is normally beneficial to start with more bundles than brokers,
+primarily because of the hashing nature of the distribution of topics into bundles. For example,
+for a namespace with 1000 topics, using something like 64 bundles will achieve a good distribution
+of traffic across 16 brokers.
+
+### Unloading topics and bundles
+
+In Pulsar there is an admin operation of "unloading" a topic. Unloading means to close the topic,
+release ownership, and reassign the topic to a new broker, based on current load.
+
+When unloading happens, the client will experience a small latency blip, typically in the order of
+tens of milliseconds, while the topic is reassigned.
+
+Unloading is the mechanism used by the load manager to perform load shedding, but it can
+also be triggered manually, for example to correct the assignments and redistribute traffic
+even before having any broker overloaded.
+
+Unloading a single topic does not change the bundle assignment; it will just close and reopen that
+particular topic:
+
+```shell
+pulsar-admin topics unload persistent://tenant/namespace/topic
+```
+
+To unload all topics for a namespace and trigger reassignments:
+
+```shell
+pulsar-admin namespaces unload tenant/namespace
+```
+
+### Namespace bundles splitting
+
+Since the load for the topics in a bundle might change over time, or could just be hard to predict
+upfront, bundles can be split in two by brokers. The new smaller bundles can then be reassigned
+to different brokers.
+
+The splitting happens based on some tunable thresholds. Any existing bundle that exceeds any
+of the thresholds is a candidate to be split. By default the newly split bundles are also
+immediately offloaded to other brokers, to facilitate the traffic distribution.
+
+```properties
+# enable/disable namespace bundle auto split
+loadBalancerAutoBundleSplitEnabled=true
+
+# enable/disable automatic unloading of split bundles
+loadBalancerAutoUnloadSplitBundlesEnabled=true
+
+# maximum topics in a bundle, otherwise bundle split will be triggered
+loadBalancerNamespaceBundleMaxTopics=1000
+
+# maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered
+loadBalancerNamespaceBundleMaxSessions=1000
+
+# maximum msgRate (in + out) in a bundle, otherwise bundle split will be triggered
+loadBalancerNamespaceBundleMaxMsgRate=30000
+
+# maximum bandwidth (in + out) in a bundle, otherwise bundle split will be triggered
+loadBalancerNamespaceBundleMaxBandwidthMbytes=100
+
+# maximum number of bundles in a namespace (for auto-split)
+loadBalancerNamespaceMaximumBundles=128
+```
+
+
+### Automatic load shedding
+
+In Pulsar's load manager there is support for automatic load shedding. 
### Automatic load shedding

In Pulsar's load manager there is support for automatic load shedding. This means that whenever the system recognizes that a particular broker is overloaded, it will force some traffic to be reassigned to less loaded brokers.

When a broker is identified as overloaded, it will be forced to "unload" a subset of the bundles, the ones with the highest traffic, that account for the overload percentage.

For example, the default threshold is 85%: if a broker is over quota at 95% CPU usage, then it will unload the percent difference plus a 5% margin: `(95% - 85%) + 5% = 15%`.

Given that the selection of bundles to offload is based on traffic (as a proxy measure for CPU, network, and memory), the broker will unload bundles accounting for at least 15% of the traffic.

The automatic load shedding is enabled by default and can be disabled with this setting:

```properties
# Enable/disable automatic bundle unloading for load-shedding
loadBalancerSheddingEnabled=true
```

There are additional settings that apply to shedding:

```properties
# Load shedding interval. Broker periodically checks whether some traffic should be offloaded from
# some over-loaded broker to other under-loaded brokers
loadBalancerSheddingIntervalMinutes=1

# Prevent the same topics from being shed and moved to other brokers more than once within this timeframe
loadBalancerSheddingGracePeriodMinutes=30
```

#### Broker overload thresholds

The determination of when a broker is overloaded is based on thresholds of CPU, network, and memory usage. Whenever any of those metrics reaches the threshold, it will trigger the shedding (if enabled).

By default, the overload threshold is set at 85%:

```properties
# Usage threshold to determine a broker as over-loaded
loadBalancerBrokerOverloadedThresholdPercentage=85
```

The usage stats are gathered by Pulsar from the system metrics.

In the case of network utilization, the network interface speed reported by Linux is sometimes not correct and needs to be manually overridden. This is the case for AWS EC2 instances with 1Gbps NIC speed for which the OS reports 10Gbps speed.

Because of the incorrect max speed, the Pulsar load manager might think the broker has not reached the NIC capacity, while in fact it's already using all the bandwidth and the traffic is being slowed down.

There is a setting to correct the max NIC speed:

```properties
# Override the auto-detection of the network interfaces max speed.
# This option is useful in some environments (eg: EC2 VMs) where the max speed
# reported by Linux is not reflecting the real bandwidth available to the broker.
# Since the network usage is employed by the load manager to decide when a broker
# is overloaded, it is important to make sure the info is correct or override it
# with the right value here. The configured value can be a double (eg: 0.8) and that
# can be used to trigger load-shedding even before hitting the NIC limits.
loadBalancerOverrideBrokerNicSpeedGbps=
```

When the value is empty, Pulsar will use the value reported by the OS.

diff --git a/site2/docs/administration-proxy.md b/site2/docs/administration-proxy.md new file mode 100644 index 0000000000000000000000000000000000000000..064869f32f42cdd11a259edc5206b9c2d678bad9 --- /dev/null +++ b/site2/docs/administration-proxy.md @@ -0,0 +1,66 @@

---
id: administration-proxy
title: The Pulsar proxy
sidebar_label: Pulsar proxy
---

The [Pulsar proxy](getting-started-concepts-and-architecture.md#pulsar-proxy) is an optional gateway that you can run over the brokers in a Pulsar cluster.
We recommend running a Pulsar proxy in cases where direct connections between clients and Pulsar brokers are either infeasible, undesirable, or both, for example when running Pulsar in a cloud environment or on [Kubernetes](https://kubernetes.io) or an analogous platform.

## Running the proxy

In order to run the Pulsar proxy, you need to have both a local [ZooKeeper](https://zookeeper.apache.org) and configuration store quorum set up for use by your Pulsar cluster. For instructions, see [this document](deploy-bare-metal.md). Once you have ZooKeeper set up and have connection strings for both ZooKeeper quorums, you can use the [`proxy`](reference-cli-tools.md#pulsar-proxy) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool to start up the proxy (preferably on its own machine or in its own VM):

```bash
$ cd /path/to/pulsar/directory
$ bin/pulsar proxy \
  --zookeeper-servers zk-0,zk-1,zk-2 \
  --global-zookeeper-servers zk-0,zk-1,zk-2
```

> You can run as many instances of the Pulsar proxy in a cluster as you would like.

## Stopping the proxy

The Pulsar proxy runs by default in the foreground. To stop the proxy, simply stop the process in which it's running.

## Proxy frontends

We recommend running the Pulsar proxy behind some kind of load-distributing frontend, such as an [HAProxy](https://www.digitalocean.com/community/tutorials/an-introduction-to-haproxy-and-load-balancing-concepts) load balancer.

## Using Pulsar clients with the proxy

Once your Pulsar proxy is up and running, preferably behind a load-distributing [frontend](#proxy-frontends), clients can connect to the proxy via whichever address is used by the frontend. If the address were the DNS address `pulsar.cluster.default`, for example, then the connection URL for clients would be `pulsar://pulsar.cluster.default:6650`.
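Clients then use the proxy URL exactly as they would a direct broker URL. As a quick check, you could produce a message through the proxy with the `pulsar-client` CLI tool (the topic name here is hypothetical):

```shell
$ bin/pulsar-client --url pulsar://pulsar.cluster.default:6650 \
  produce my-topic --messages "hello-through-the-proxy"
```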
## Proxy configuration

The Pulsar proxy can be configured using the [`proxy.conf`](reference-configuration.md#proxy) configuration file. The following parameters are available in that file:

|Name|Description|Default|
|---|---|---|
|zookeeperServers| The ZooKeeper quorum connection string (as a comma-separated list) ||
|configurationStoreServers| Configuration store connection string (as a comma-separated list) ||
|zookeeperSessionTimeoutMs| ZooKeeper session timeout (in milliseconds) |30000|
|servicePort| The port to use for serving binary Protobuf requests |6650|
|servicePortTls| The port to use for serving binary Protobuf TLS requests |6651|
|statusFilePath | Path for the file used to determine the rotation status for the proxy instance when responding to service discovery health checks ||
|authenticationEnabled| Whether authentication is enabled for the Pulsar proxy |false|
|authenticationProviders| Authentication provider name list (a comma-separated list of class names) ||
|authorizationEnabled| Whether authorization is enforced by the Pulsar proxy |false|
|authorizationProvider| Authorization provider as a fully qualified class name |org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider|
|brokerClientAuthenticationPlugin| The authentication plugin used by the Pulsar proxy to authenticate with Pulsar brokers ||
|brokerClientAuthenticationParameters| The authentication parameters used by the Pulsar proxy to authenticate with Pulsar brokers ||
|brokerClientTrustCertsFilePath| The path to trusted certificates used by the Pulsar proxy to authenticate with Pulsar brokers ||
|superUserRoles| Role names that are treated as "super-users," meaning that they will be able to perform all admin operations ||
|forwardAuthorizationCredentials| Whether client authorization credentials are forwarded to the broker for re-authorization. Authentication must be enabled via authenticationEnabled=true for this to take effect. |false|
|maxConcurrentInboundConnections| Max concurrent inbound connections. The proxy will reject requests beyond that. |10000|
|maxConcurrentLookupRequests| Max concurrent outbound connections. The proxy will error out requests beyond that. |10000|
|tlsEnabledInProxy| Whether TLS is enabled for the proxy |false|
|tlsEnabledWithBroker| Whether TLS is enabled when communicating with Pulsar brokers |false|
|tlsCertificateFilePath| Path for the TLS certificate file ||
|tlsKeyFilePath| Path for the TLS private key file ||
|tlsTrustCertsFilePath| Path for the trusted TLS certificate pem file ||
|tlsHostnameVerificationEnabled| Whether the hostname is validated when the proxy creates a TLS connection with brokers |false|
|tlsRequireTrustedClientCertOnConnect| Whether client certificates are required for TLS. Connections are rejected if the client certificate isn't trusted. |false|
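As a minimal illustration, a `proxy.conf` for a proxy without authentication might only set the connection strings and service port (the hostnames below are hypothetical):

```properties
# Local ZooKeeper quorum of the cluster the proxy fronts
zookeeperServers=zk-0:2181,zk-1:2181,zk-2:2181

# Configuration store quorum
configurationStoreServers=zk-0:2184,zk-1:2184,zk-2:2184

# Port for binary Protobuf requests
servicePort=6650
```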
diff --git a/site2/docs/administration-stats.md b/site2/docs/administration-stats.md new file mode 100644 index 0000000000000000000000000000000000000000..454084d395557d311e0263854ca3b8e280316e0e --- /dev/null +++ b/site2/docs/administration-stats.md @@ -0,0 +1,63 @@

---
id: administration-stats
title: Pulsar stats
sidebar_label: Pulsar statistics
---

## Partitioned topics

|Stat|Description|
|---|---|
|msgRateIn| The sum of all local and replication publishers' publish rates in messages per second|
|msgThroughputIn| Same as msgRateIn but in bytes per second instead of messages per second|
|msgRateOut| The sum of all local and replication consumers' dispatch rates in messages per second|
|msgThroughputOut| Same as msgRateOut but in bytes per second instead of messages per second|
|averageMsgSize| Average message size, in bytes, from this publisher within the last interval|
|storageSize| The sum of the ledgers' storage size for this topic|
|publishers| The list of all local publishers into the topic. There can be anywhere from zero to thousands.|
|producerId| Internal identifier for this producer on this topic|
|producerName| Internal identifier for this producer, generated by the client library|
|address| IP address and source port for the connection of this producer|
|connectedSince| Timestamp when this producer was created or last reconnected|
|subscriptions| The list of all local subscriptions to the topic|
|my-subscription| The name of this subscription (client defined)|
|msgBacklog| The count of messages in backlog for this subscription|
|type| The subscription type|
|msgRateExpired| The rate at which messages were discarded instead of dispatched from this subscription due to TTL|
|consumers| The list of connected consumers for this subscription|
|consumerName| Internal identifier for this consumer, generated by the client library|
|availablePermits| The number of messages this consumer has space for in the client library's listen queue. A value of 0 means the client library's queue is full and receive() isn't being called. A nonzero value means this consumer is ready to be dispatched messages.|
|replication| This section gives the stats for cross-colo replication of this topic|
|replicationBacklog| The outbound replication backlog in messages|
|connected| Whether the outbound replicator is connected|
|replicationDelayInSeconds| How long the oldest message has been waiting to be sent through the connection, if connected is true|
|inboundConnection| The IP and port of the broker in the remote cluster's publisher connection to this broker|
|inboundConnectedSince| The TCP connection being used to publish messages to the remote cluster. If there are no local publishers connected, this connection is automatically closed after a minute.|
## Topics

|Stat|Description|
|---|---|
|entriesAddedCounter| Messages published since this broker loaded this topic|
|numberOfEntries| Total number of messages being tracked|
|totalSize| Total storage size in bytes of all messages|
|currentLedgerEntries| Count of messages written to the ledger currently open for writing|
|currentLedgerSize| Size in bytes of messages written to the ledger currently open for writing|
|lastLedgerCreatedTimestamp| Time when the last ledger was created|
|lastLedgerCreationFailureTimestamp| Time when the last ledger creation failed|
|waitingCursorsCount| How many cursors are caught up and waiting for a new message to be published|
|pendingAddEntriesCount| How many messages have (asynchronous) write requests pending completion|
|lastConfirmedEntry| The ledgerid:entryid of the last message successfully written. If the entryid is -1, then the ledger has been opened or is currently being opened but has no entries written yet.|
|state| The state of the cursor ledger. Open means we have a cursor ledger for saving updates of the markDeletePosition.|
|ledgers| The ordered list of all ledgers for this topic holding its messages|
|cursors| The list of all cursors on this topic. There will be one for every subscription you saw in the topic stats.|
|markDeletePosition| The ack position: the last message the subscriber acknowledged receiving|
|readPosition| The latest position from which the subscriber reads messages|
|waitingReadOp| This is true when the subscription has read the latest message published to the topic and is waiting on new messages to be published.|
|pendingReadOps| The counter for how many outstanding read requests to the BookKeepers we have in progress|
|messagesConsumedCounter| Number of messages this cursor has acked since this broker loaded this topic|
|cursorLedger| The ledger being used to persistently store the current markDeletePosition|
|cursorLedgerLastEntry| The last entryid used to persistently store the current markDeletePosition|
|individuallyDeletedMessages| If acks are being done out of order, the ranges of messages acked between the markDeletePosition and the readPosition|
|lastLedgerSwitchTimestamp| The last time the cursor ledger was rolled over|
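The stats in the first table are typically retrieved with the `stats` (or `partitioned-stats`) subcommand of `pulsar-admin topics`, while the internal, storage-level stats in the second table come from `stats-internal`. A sketch (the topic name is hypothetical):

```shell
$ bin/pulsar-admin topics stats persistent://my-tenant/my-ns/my-topic
$ bin/pulsar-admin topics stats-internal persistent://my-tenant/my-ns/my-topic
```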
diff --git a/site2/docs/administration-zk-bk.md b/site2/docs/administration-zk-bk.md new file mode 100644 index 0000000000000000000000000000000000000000..cf0eb3a079273bb5fd11536544aadebf13af091b --- /dev/null +++ b/site2/docs/administration-zk-bk.md @@ -0,0 +1,334 @@

---
id: administration-zk-bk
title: ZooKeeper and BookKeeper administration
sidebar_label: ZooKeeper and BookKeeper
---

Pulsar relies on two external systems for essential tasks:

* [ZooKeeper](https://zookeeper.apache.org/) is responsible for a wide variety of configuration- and coordination-related tasks.
* [BookKeeper](http://bookkeeper.apache.org/) is responsible for [persistent storage](getting-started-concepts-and-architecture.md#persistent-storage) of message data.

ZooKeeper and BookKeeper are both open-source [Apache](https://www.apache.org/) projects.

> Skip to the [How Pulsar uses ZooKeeper and BookKeeper](#how-pulsar-uses-zookeeper-and-bookkeeper) section below for a more schematic explanation of the role of these two systems in Pulsar.

## ZooKeeper

Each Pulsar instance relies on two separate ZooKeeper quorums.

* [Local ZooKeeper](#deploying-local-zookeeper) operates at the cluster level and provides cluster-specific configuration management and coordination. Each Pulsar cluster needs to have a dedicated ZooKeeper cluster.
* [Global ZooKeeper](#configuration-store) operates at the instance level and provides configuration management for the entire system (and thus across clusters). The global ZooKeeper quorum can be provided by an independent cluster of machines or by the same machines used by local ZooKeeper.

### Deploying local ZooKeeper

ZooKeeper manages a variety of essential coordination- and configuration-related tasks for Pulsar.

Deploying a Pulsar instance requires you to stand up one local ZooKeeper cluster *per Pulsar cluster*.

To begin, add all ZooKeeper servers to the quorum configuration specified in the [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file. Add a `server.N` line for each node in the cluster to the configuration, where `N` is the number of the ZooKeeper node. Here's an example for a three-node cluster:

```properties
server.1=zk1.us-west.example.com:2888:3888
server.2=zk2.us-west.example.com:2888:3888
server.3=zk3.us-west.example.com:2888:3888
```

On each host, you need to specify the ID of the node in each node's `myid` file, which is in each server's `data/zookeeper` folder by default (this can be changed via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter).

> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed info on `myid` and more.

On a ZooKeeper server at `zk1.us-west.example.com`, for example, you could set the `myid` value like this:

```shell
$ mkdir -p data/zookeeper
$ echo 1 > data/zookeeper/myid
```

On `zk2.us-west.example.com` the command would be `echo 2 > data/zookeeper/myid` and so on.

Once each server has been added to the `zookeeper.conf` configuration and has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool:

```shell
$ bin/pulsar-daemon start zookeeper
```

### Deploying the configuration store {#configuration-store}

The ZooKeeper cluster configured and started up in the section above is a *local* ZooKeeper cluster used to manage a single Pulsar cluster. In addition to a local cluster, however, a full Pulsar instance also requires a configuration store for handling some instance-level configuration and coordination tasks.

If you're deploying a [single-cluster](#single-cluster-pulsar-instance) instance, then you will not need a separate cluster for the configuration store. If, however, you're deploying a [multi-cluster](#multi-cluster-pulsar-instance) instance, then you should stand up a separate ZooKeeper cluster for configuration tasks.

#### Single-cluster Pulsar instance

If your Pulsar instance will consist of just one cluster, then you can deploy a configuration store on the same machines as the local ZooKeeper quorum but running on different TCP ports.
To deploy a ZooKeeper configuration store in a single-cluster instance, add the same ZooKeeper servers used by the local quorum to the configuration file in [`conf/global_zookeeper.conf`](reference-configuration.md#configuration-store) using the same method as for [local ZooKeeper](#deploying-local-zookeeper), but make sure to use a different port (2181 is the default for ZooKeeper). Here's an example that uses port 2184 for a three-node ZooKeeper cluster:

```properties
clientPort=2184
server.1=zk1.us-west.example.com:2185:2186
server.2=zk2.us-west.example.com:2185:2186
server.3=zk3.us-west.example.com:2185:2186
```

As before, create the `myid` files for each server, this time in `data/global-zookeeper/myid`.

#### Multi-cluster Pulsar instance

When deploying a global Pulsar instance, with clusters distributed across different geographical regions, the global ZooKeeper serves as a highly available and strongly consistent metadata store that can tolerate failures and partitions spanning whole regions.

The key here is to make sure the ZK quorum members are spread across at least 3 regions and that the other regions are running as observers.

Again, given the very low expected load on the global ZooKeeper servers, we can share the same hosts used for the local ZooKeeper quorum.

For example, let's assume a Pulsar instance with the following clusters: `us-west`, `us-east`, `us-central`, `eu-central`, and `ap-south`. Let's also assume that each cluster has its own local ZK servers named like:

```
zk[1-3].${CLUSTER}.example.com
```

In this scenario we want to pick the quorum participants from a few clusters and let all the others be ZK observers. For example, to form a 7-server quorum, we can pick 3 servers from `us-west`, 2 from `us-central`, and 2 from `us-east`.

This will guarantee that writes to global ZooKeeper will be possible even if one of these regions is unreachable.

The ZK configuration in all the servers will look like:

```properties
clientPort=2184
server.1=zk1.us-west.example.com:2185:2186
server.2=zk2.us-west.example.com:2185:2186
server.3=zk3.us-west.example.com:2185:2186
server.4=zk1.us-central.example.com:2185:2186
server.5=zk2.us-central.example.com:2185:2186
server.6=zk3.us-central.example.com:2185:2186:observer
server.7=zk1.us-east.example.com:2185:2186
server.8=zk2.us-east.example.com:2185:2186
server.9=zk3.us-east.example.com:2185:2186:observer
server.10=zk1.eu-central.example.com:2185:2186:observer
server.11=zk2.eu-central.example.com:2185:2186:observer
server.12=zk3.eu-central.example.com:2185:2186:observer
server.13=zk1.ap-south.example.com:2185:2186:observer
server.14=zk2.ap-south.example.com:2185:2186:observer
server.15=zk3.ap-south.example.com:2185:2186:observer
```

Additionally, ZK observers will need to have:

```properties
peerType=observer
```

##### Starting the service

Once your global ZooKeeper configuration is in place, you can start up the service using [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon):

```shell
$ bin/pulsar-daemon start global-zookeeper
```

### ZooKeeper configuration

In Pulsar, ZooKeeper configuration is handled by two separate configuration files found in the `conf` directory of your Pulsar installation: `conf/zookeeper.conf` for [local ZooKeeper](#deploying-local-zookeeper) and `conf/global_zookeeper.conf` for [global ZooKeeper](#configuration-store).
#### Local ZooKeeper

Configuration for local ZooKeeper is handled by the [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file. The table below shows the available parameters:

|Name|Description|Default|
|---|---|---|
|tickTime| The tick is the basic unit of time in ZooKeeper, measured in milliseconds and used to regulate things like heartbeats and timeouts. tickTime is the length of a single tick. |2000|
|initLimit| The maximum time, in ticks, that the leader ZooKeeper server allows follower ZooKeeper servers to successfully connect and sync. The tick time is set in milliseconds using the tickTime parameter. |10|
|syncLimit| The maximum time, in ticks, that a follower ZooKeeper server is allowed to sync with other ZooKeeper servers. The tick time is set in milliseconds using the tickTime parameter. |5|
|dataDir| The location where ZooKeeper will store in-memory database snapshots as well as the transaction log of updates to the database. |data/zookeeper|
|clientPort| The port on which the ZooKeeper server will listen for connections. |2181|
|autopurge.snapRetainCount| In ZooKeeper, auto purge determines how many recent snapshots of the database stored in dataDir to retain within the time interval specified by autopurge.purgeInterval (while deleting the rest). |3|
|autopurge.purgeInterval| The time interval, in hours, by which the ZooKeeper database purge task is triggered. Setting to a non-zero number will enable auto purge; setting to 0 will disable. Read this guide before enabling auto purge. |1|
|maxClientCnxns| The maximum number of client connections. Increase this if you need to handle more ZooKeeper clients. |60|

#### Global ZooKeeper

Configuration for global ZooKeeper is handled by the [`conf/global_zookeeper.conf`](reference-configuration.md#global-zookeeper) file.

## BookKeeper

BookKeeper is responsible for all durable message storage in Pulsar. BookKeeper is a distributed [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging) (WAL) system that guarantees read consistency of independent message logs called ledgers. Individual BookKeeper servers are also called *bookies*.

> For a guide to managing message persistence, retention, and expiry in Pulsar, see [this cookbook](cookbooks-retention-expiry.md).

### Deploying BookKeeper

BookKeeper provides [persistent message storage](getting-started-concepts-and-architecture.md#persistent-storage) for Pulsar.

Each Pulsar broker needs to have its own cluster of bookies. The BookKeeper cluster shares a local ZooKeeper quorum with the Pulsar cluster.

### Configuring bookies

BookKeeper bookies can be configured using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. The most important aspect of configuring each bookie is ensuring that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) parameter is set to the connection string for the Pulsar cluster's local ZooKeeper.

### Starting up bookies

You can start up a bookie in two ways: in the foreground or as a background daemon.
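To run a bookie in the foreground, you can invoke the BookKeeper CLI directly. A minimal sketch, assuming the standard Pulsar distribution layout:

```shell
$ bin/bookkeeper bookie
```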
To start a bookie as a background daemon, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool:

```shell
$ bin/pulsar-daemon start bookie
```

You can verify that the bookie is working properly using the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#bookkeeper-shell):

```shell
$ bin/bookkeeper shell bookiesanity
```

This will create a new ledger on the local bookie, write a few entries, read them back, and finally delete the ledger.

### Hardware considerations

Bookie hosts are responsible for storing message data on disk. In order for bookies to provide optimal performance, it's essential that they have a suitable hardware configuration. There are two key dimensions to bookie hardware capacity:

* Read/write disk I/O capacity
* Storage capacity

Message entries written to bookies are always synced to disk before returning an acknowledgement to the Pulsar broker. To ensure low write latency, BookKeeper is designed to use multiple devices:

* A **journal** to ensure durability. For sequential writes, it's critical to have fast [fsync](https://linux.die.net/man/2/fsync) operations on bookie hosts. Typically, small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) should suffice, or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache. Both solutions can reach fsync latency of ~0.4 ms.
* A **ledger storage device** is where data is stored until all consumers have acknowledged the message. Writes happen in the background, so write I/O is not a big concern. Reads happen sequentially most of the time, and the backlog is read back from disk only when consumers fall behind and drain it. To store large amounts of data, a typical configuration will involve multiple HDDs with a RAID controller.

### Configuring BookKeeper

Configurable parameters for BookKeeper bookies can be found in the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) file.

Minimum configuration changes required in `conf/bookkeeper.conf` are:

```properties
# Change to point to journal disk mount point
journalDirectory=data/bookkeeper/journal

# Point to ledger storage disk mount point
ledgerDirectories=data/bookkeeper/ledgers

# Point to local ZK quorum
zkServers=zk1.example.com:2181,zk2.example.com:2181,zk3.example.com:2181

# Change the ledger manager type
ledgerManagerType=hierarchical
```

> Consult the official [BookKeeper docs](http://bookkeeper.apache.org) for more information about BookKeeper.

## BookKeeper persistence policies

In Pulsar, you can set *persistence policies*, at the namespace level, that determine how BookKeeper handles persistent storage of messages. Policies determine four things:

* The number of acks (guaranteed copies) to wait for each ledger entry
* The number of bookies to use for a topic
* How many writes to make for each ledger entry
* The throttling rate for mark-delete operations

### Set persistence policies

You can set persistence policies for BookKeeper at the namespace level.

#### pulsar-admin

Use the [`set-persistence`](reference-pulsar-admin.md#namespaces-set-persistence) subcommand and specify a namespace as well as any policies that you want to apply.
The available flags are:

Flag | Description | Default
:----|:------------|:-------
`-a`, `--bookkeeper-ack-quorum` | The number of acks (guaranteed copies) to wait on for each entry | 0
`-e`, `--bookkeeper-ensemble` | The number of bookies to use for topics in the namespace | 0
`-w`, `--bookkeeper-write-quorum` | How many writes to make for each entry | 0
`-r`, `--ml-mark-delete-max-rate` | Throttling rate for mark-delete operations (0 means no throttle) | 0

##### Example

```shell
$ pulsar-admin namespaces set-persistence my-tenant/my-ns \
  --bookkeeper-ensemble 3 \
  --bookkeeper-ack-quorum 2
```

#### REST API

```http
POST /admin/v2/namespaces/:tenant/:namespace/persistence
```

[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/persistence)

#### Java

```java
int bkEnsemble = 3;
int bkWriteQuorum = 2;
int bkAckQuorum = 2;
double markDeleteRate = 0.7;
PersistencePolicies policies =
    new PersistencePolicies(bkEnsemble, bkWriteQuorum, bkAckQuorum, markDeleteRate);
admin.namespaces().setPersistence(namespace, policies);
```

### List persistence policies

You can see which persistence policy currently applies to a namespace.

#### pulsar-admin

Use the [`get-persistence`](reference-pulsar-admin.md#namespaces-get-persistence) subcommand and specify the namespace.

##### Example

```shell
$ pulsar-admin namespaces get-persistence my-tenant/my-ns
{
  "bookkeeperEnsemble": 1,
  "bookkeeperWriteQuorum": 1,
  "bookkeeperAckQuorum": 1,
  "managedLedgerMaxMarkDeleteRate": 0
}
```

#### REST API

```http
GET /admin/v2/namespaces/:tenant/:namespace/persistence
```

[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/persistence)

#### Java

```java
PersistencePolicies policies = admin.namespaces().getPersistence(namespace);
```

## How Pulsar uses ZooKeeper and BookKeeper

This diagram illustrates the role of ZooKeeper and BookKeeper in a Pulsar cluster:

![ZooKeeper and BookKeeper](/docs/assets/pulsar-system-architecture.png)

Each Pulsar cluster consists of one or more message brokers. Each broker relies on an ensemble of bookies.
diff --git a/site2/docs/assets/binary-protocol-connect.png b/site2/docs/assets/binary-protocol-connect.png new file mode 100644 index 0000000000000000000000000000000000000000..e6b0d6ae7d03c7762d180a4a71910cf7ecb08e2b Binary files /dev/null and b/site2/docs/assets/binary-protocol-connect.png differ diff --git a/site2/docs/assets/binary-protocol-consumer.png b/site2/docs/assets/binary-protocol-consumer.png new file mode 100644 index 0000000000000000000000000000000000000000..b52638537936e17089df9abd3e97cb678dbd5fb1 Binary files /dev/null and b/site2/docs/assets/binary-protocol-consumer.png differ diff --git a/site2/docs/assets/binary-protocol-producer.png b/site2/docs/assets/binary-protocol-producer.png new file mode 100644 index 0000000000000000000000000000000000000000..e971737be85a5b902f41e66d388b407b46adfd09 Binary files /dev/null and b/site2/docs/assets/binary-protocol-producer.png differ diff --git a/site2/docs/assets/binary-protocol-topic-lookup.png b/site2/docs/assets/binary-protocol-topic-lookup.png new file mode 100644 index 0000000000000000000000000000000000000000..2fd855154954835b755c32db4ba1a3a9d2e3a924 Binary files /dev/null and b/site2/docs/assets/binary-protocol-topic-lookup.png differ diff --git a/site2/docs/assets/broker-bookie.png b/site2/docs/assets/broker-bookie.png new file mode 100644 index 0000000000000000000000000000000000000000..c866159c2b112ac56261e6d88dfc6f52224d97e8 Binary files /dev/null and b/site2/docs/assets/broker-bookie.png differ diff --git a/site2/docs/assets/dcos_bookie_log.png b/site2/docs/assets/dcos_bookie_log.png new file mode 100644 index 0000000000000000000000000000000000000000..31ac263847ba967d91aa6121ae577ef054e28840 Binary files /dev/null and b/site2/docs/assets/dcos_bookie_log.png differ diff --git a/site2/docs/assets/dcos_bookkeeper_in_zookeeper.png b/site2/docs/assets/dcos_bookkeeper_in_zookeeper.png new file mode 100644 index 0000000000000000000000000000000000000000..80680bc3307121aae974b64fb45c194713f22522 Binary files /dev/null and b/site2/docs/assets/dcos_bookkeeper_in_zookeeper.png differ diff --git a/site2/docs/assets/dcos_bookkeeper_run.png b/site2/docs/assets/dcos_bookkeeper_run.png new file mode 100644 index 0000000000000000000000000000000000000000..8cda68ca3267e9eb2822b5cb67145127d386cf32 Binary files /dev/null and b/site2/docs/assets/dcos_bookkeeper_run.png differ diff --git a/site2/docs/assets/dcos_bookkeeper_status.png b/site2/docs/assets/dcos_bookkeeper_status.png new file mode 100644 index 0000000000000000000000000000000000000000..5e09a0c027a4f464f4a542ec635a98dbea2b0689 Binary files /dev/null and b/site2/docs/assets/dcos_bookkeeper_status.png differ diff --git a/site2/docs/assets/dcos_broker_in_zookeeper.png b/site2/docs/assets/dcos_broker_in_zookeeper.png new file mode 100644 index 0000000000000000000000000000000000000000..3563e34669bc6b16937a20216c1fc6bc7d08c89f Binary files /dev/null and b/site2/docs/assets/dcos_broker_in_zookeeper.png differ diff --git a/site2/docs/assets/dcos_broker_log.png b/site2/docs/assets/dcos_broker_log.png new file mode 100644 index 0000000000000000000000000000000000000000..dfb78a7c0bad5b0ee79340ee6e715b953f7009ed Binary files /dev/null and b/site2/docs/assets/dcos_broker_log.png differ diff --git a/site2/docs/assets/dcos_broker_run.png b/site2/docs/assets/dcos_broker_run.png new file mode 100644 index 0000000000000000000000000000000000000000..9afeadb57b4c2447077a1b7e6bb9f29e62c373d0 Binary files /dev/null and b/site2/docs/assets/dcos_broker_run.png differ diff --git 
a/site2/docs/assets/dcos_broker_status.png b/site2/docs/assets/dcos_broker_status.png new file mode 100644 index 0000000000000000000000000000000000000000..d42f233dd718f33e6ad83e9aa179cdd1e02fa414 Binary files /dev/null and b/site2/docs/assets/dcos_broker_status.png differ diff --git a/site2/docs/assets/dcos_command_execute.png b/site2/docs/assets/dcos_command_execute.png new file mode 100644 index 0000000000000000000000000000000000000000..a5c4c4281abd4291db76adcb88b3da5504091cd7 Binary files /dev/null and b/site2/docs/assets/dcos_command_execute.png differ diff --git a/site2/docs/assets/dcos_command_execute2.png b/site2/docs/assets/dcos_command_execute2.png new file mode 100644 index 0000000000000000000000000000000000000000..5670cd0a9b26cdc1b6fbcae1cca99a61d9aaaa3c Binary files /dev/null and b/site2/docs/assets/dcos_command_execute2.png differ diff --git a/site2/docs/assets/dcos_consumer.png b/site2/docs/assets/dcos_consumer.png new file mode 100644 index 0000000000000000000000000000000000000000..8b9b64afb4472d703d1b43a0272e70677a8edef8 Binary files /dev/null and b/site2/docs/assets/dcos_consumer.png differ diff --git a/site2/docs/assets/dcos_grafana_dashboard.png b/site2/docs/assets/dcos_grafana_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..b937003dd399f5e50997faa545f4be1f7b8f1a38 Binary files /dev/null and b/site2/docs/assets/dcos_grafana_dashboard.png differ diff --git a/site2/docs/assets/dcos_grafana_endpoint.png b/site2/docs/assets/dcos_grafana_endpoint.png new file mode 100644 index 0000000000000000000000000000000000000000..20e5894435d4d08bc82b6effe56dd12c72a42da0 Binary files /dev/null and b/site2/docs/assets/dcos_grafana_endpoint.png differ diff --git a/site2/docs/assets/dcos_metrics.png b/site2/docs/assets/dcos_metrics.png new file mode 100644 index 0000000000000000000000000000000000000000..7e0651265449fd744ff3d6dd736891fe779153bc Binary files /dev/null and b/site2/docs/assets/dcos_metrics.png differ diff --git a/site2/docs/assets/dcos_monitor_status.png b/site2/docs/assets/dcos_monitor_status.png new file mode 100644 index 0000000000000000000000000000000000000000..bfc208923703372c0902d429c4326fd36b6c5da3 Binary files /dev/null and b/site2/docs/assets/dcos_monitor_status.png differ diff --git a/site2/docs/assets/dcos_producer.png b/site2/docs/assets/dcos_producer.png new file mode 100644 index 0000000000000000000000000000000000000000..21a7cfcdf2c0bfd6c0cef08da8c46c9ea2f7b6ce Binary files /dev/null and b/site2/docs/assets/dcos_producer.png differ diff --git a/site2/docs/assets/dcos_prom_endpoint.png b/site2/docs/assets/dcos_prom_endpoint.png new file mode 100644 index 0000000000000000000000000000000000000000..36c9b8c83e2151b41407d4bc08115a42d8e04fc1 Binary files /dev/null and b/site2/docs/assets/dcos_prom_endpoint.png differ diff --git a/site2/docs/assets/dcos_prom_targets.png b/site2/docs/assets/dcos_prom_targets.png new file mode 100644 index 0000000000000000000000000000000000000000..0d362f3284ffa1bbac6c0fd1e848829e4866ffbe Binary files /dev/null and b/site2/docs/assets/dcos_prom_targets.png differ diff --git a/site2/docs/assets/dcos_uninstall.png b/site2/docs/assets/dcos_uninstall.png new file mode 100644 index 0000000000000000000000000000000000000000..4ef4f569fa2950eb2b9b0433b3501a2a37d04a8c Binary files /dev/null and b/site2/docs/assets/dcos_uninstall.png differ diff --git a/site2/docs/assets/geo-replication.png b/site2/docs/assets/geo-replication.png new file mode 100644 index 
0000000000000000000000000000000000000000..f913d54d24cdd16e4ea474832d9fa88e40cb5edc Binary files /dev/null and b/site2/docs/assets/geo-replication.png differ diff --git a/site2/docs/assets/message-deduplication.png b/site2/docs/assets/message-deduplication.png new file mode 100644 index 0000000000000000000000000000000000000000..23e3e6009d5f3e8deb56189f39363a1f6d6f8ffc Binary files /dev/null and b/site2/docs/assets/message-deduplication.png differ diff --git a/site2/docs/assets/partitioning.png b/site2/docs/assets/partitioning.png new file mode 100644 index 0000000000000000000000000000000000000000..b0494522b3df679578609e280b9c87a64bbfdda3 Binary files /dev/null and b/site2/docs/assets/partitioning.png differ diff --git a/site2/docs/assets/pulsar-basic-setup.png b/site2/docs/assets/pulsar-basic-setup.png new file mode 100644 index 0000000000000000000000000000000000000000..bb85eb910e6f4ca4d101098ce60c421c04e0e2aa Binary files /dev/null and b/site2/docs/assets/pulsar-basic-setup.png differ diff --git a/site2/docs/assets/pulsar-encryption-consumer.jpg b/site2/docs/assets/pulsar-encryption-consumer.jpg new file mode 100644 index 0000000000000000000000000000000000000000..41f3f2489289b192405c0d26fd1e7a8314966a3b Binary files /dev/null and b/site2/docs/assets/pulsar-encryption-consumer.jpg differ diff --git a/site2/docs/assets/pulsar-encryption-producer.jpg b/site2/docs/assets/pulsar-encryption-producer.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1c4050e7e40229ea04841e3d3d8faf3c67b49a1e Binary files /dev/null and b/site2/docs/assets/pulsar-encryption-producer.jpg differ diff --git a/site2/docs/assets/pulsar-exclusive-subscriptions.png b/site2/docs/assets/pulsar-exclusive-subscriptions.png new file mode 100644 index 0000000000000000000000000000000000000000..3d5867b4ed21b7fc2c8c858cb198562c4db34947 Binary files /dev/null and b/site2/docs/assets/pulsar-exclusive-subscriptions.png differ diff --git a/site2/docs/assets/pulsar-failover-subscriptions.png b/site2/docs/assets/pulsar-failover-subscriptions.png new file mode 100644 index 0000000000000000000000000000000000000000..2cf83fc1c5b701bc20b51b29e4f3c49bfb11e2ca Binary files /dev/null and b/site2/docs/assets/pulsar-failover-subscriptions.png differ diff --git a/site2/docs/assets/pulsar-functions-overview.png b/site2/docs/assets/pulsar-functions-overview.png new file mode 100644 index 0000000000000000000000000000000000000000..065046bd63a65954b409685fc31002e304346a53 Binary files /dev/null and b/site2/docs/assets/pulsar-functions-overview.png differ diff --git a/site2/docs/assets/pulsar-functions-routing-example.png b/site2/docs/assets/pulsar-functions-routing-example.png new file mode 100644 index 0000000000000000000000000000000000000000..27a1c4417ac61becf0fe84cd05921533a3e15084 Binary files /dev/null and b/site2/docs/assets/pulsar-functions-routing-example.png differ diff --git a/site2/docs/assets/pulsar-functions-word-count.png b/site2/docs/assets/pulsar-functions-word-count.png new file mode 100644 index 0000000000000000000000000000000000000000..ad0c280938aad55b5b15d6cc07f71ad6e50e9a67 Binary files /dev/null and b/site2/docs/assets/pulsar-functions-word-count.png differ diff --git a/site2/docs/assets/pulsar-io.png b/site2/docs/assets/pulsar-io.png new file mode 100644 index 0000000000000000000000000000000000000000..3e74d4bab73f27d21655e5562d65ac03c91a0d52 Binary files /dev/null and b/site2/docs/assets/pulsar-io.png differ diff --git a/site2/docs/assets/pulsar-reader-consumer-interfaces.png 
b/site2/docs/assets/pulsar-reader-consumer-interfaces.png new file mode 100644 index 0000000000000000000000000000000000000000..26f05d30260133f5381dd87ca2aa3ee932fa6c0b Binary files /dev/null and b/site2/docs/assets/pulsar-reader-consumer-interfaces.png differ diff --git a/site2/docs/assets/pulsar-service-discovery.png b/site2/docs/assets/pulsar-service-discovery.png new file mode 100644 index 0000000000000000000000000000000000000000..4dc32246177b89dcfab63be7fed88f8cd6dcf7c3 Binary files /dev/null and b/site2/docs/assets/pulsar-service-discovery.png differ diff --git a/site2/docs/assets/pulsar-shared-subscriptions.png b/site2/docs/assets/pulsar-shared-subscriptions.png new file mode 100644 index 0000000000000000000000000000000000000000..13c0dae03789a261cc5aec3896fdfd0d726a0e03 Binary files /dev/null and b/site2/docs/assets/pulsar-shared-subscriptions.png differ diff --git a/site2/docs/assets/pulsar-subscription-modes.png b/site2/docs/assets/pulsar-subscription-modes.png new file mode 100644 index 0000000000000000000000000000000000000000..e8e618b80a03b2015dbfbe53ad3d6e603598a979 Binary files /dev/null and b/site2/docs/assets/pulsar-subscription-modes.png differ diff --git a/site2/docs/assets/pulsar-system-architecture.png b/site2/docs/assets/pulsar-system-architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..7e14381ed7b08ffcf33d46cbde51947a0c26a9cf Binary files /dev/null and b/site2/docs/assets/pulsar-system-architecture.png differ diff --git a/site2/docs/assets/pulsar-tiered-storage.png b/site2/docs/assets/pulsar-tiered-storage.png new file mode 100644 index 0000000000000000000000000000000000000000..f5a250b89c188e50b104d6b4eefe15e3a43a9ed4 Binary files /dev/null and b/site2/docs/assets/pulsar-tiered-storage.png differ diff --git a/site2/docs/assets/retention-expiry.png b/site2/docs/assets/retention-expiry.png new file mode 100644 index 0000000000000000000000000000000000000000..5b9f5fd9461350f4c5b96c3b9fcbd6415b76d1d6 Binary files /dev/null and b/site2/docs/assets/retention-expiry.png differ diff --git a/site2/docs/client-libraries-cpp.md b/site2/docs/client-libraries-cpp.md new file mode 100644 index 0000000000000000000000000000000000000000..e788d2f750bd47d052362c7503797967d348d97d --- /dev/null +++ b/site2/docs/client-libraries-cpp.md @@ -0,0 +1,145 @@ +--- +id: client-libraries-cpp +title: The Pulsar C++ client +sidebar_label: C++ +--- + +## Supported platforms + +The Pulsar C++ client has been successfully tested on **MacOS** and **Linux**. + +## Linux + +There are recipes that build RPM and Debian packages containing a +statically linked `libpulsar.so` / `libpulsar.a` with all the required +dependencies. + +To build the C++ library packages, first build the Java packages: + +```shell +mvn install -DskipTests +``` + +#### RPM + +```shell +pulsar-client-cpp/pkg/rpm/docker-build-rpm.sh +``` + +This will build the RPM inside a Docker container and it will leave the RPMs +in `pulsar-client-cpp/pkg/rpm/RPMS/x86_64/`. 
| Package name | Content |
|-----|-----|
| pulsar-client | Shared library `libpulsar.so` |
| pulsar-client-devel | Static library `libpulsar.a` and C++ and C headers |
| pulsar-client-debuginfo | Debug symbols for `libpulsar.so` |

#### Deb

To build Debian packages:

```shell
pulsar-client-cpp/pkg/deb/docker-build-deb.sh
```

Debian packages will be created at `pulsar-client-cpp/pkg/deb/BUILD/DEB/`.

| Package name | Content |
|-----|-----|
| pulsar-client | Shared library `libpulsar.so` |
| pulsar-client-dev | Static library `libpulsar.a` and C++ and C headers |

## MacOS

Use the [Homebrew](https://brew.sh/)-supplied recipe to build the Pulsar client library on MacOS:

```shell
brew install https://raw.githubusercontent.com/apache/incubator-pulsar/master/pulsar-client-cpp/homebrew/libpulsar.rb
```

If using Python 3 on MacOS, add the flag `--with-python3` to the above command.

This will install the package with the library and headers.

## Connection URLs

To connect to Pulsar using client libraries, you need to specify a Pulsar protocol URL.

Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme, and have a default port of 6650. Here's an example for `localhost`:

```http
pulsar://localhost:6650
```

A URL for a production Pulsar cluster may look something like this:

```http
pulsar://pulsar.us-west.example.com:6650
```

If you're using TLS authentication, the URL will look something like this:

```http
pulsar+ssl://pulsar.us-west.example.com:6651
```

## Consumer

```c++
Client client("pulsar://localhost:6650");

Consumer consumer;
Result result = client.subscribe("my-topic", "my-subscription-name", consumer);
if (result != ResultOk) {
    LOG_ERROR("Failed to subscribe: " << result);
    return -1;
}

Message msg;

while (true) {
    consumer.receive(msg);
    LOG_INFO("Received: " << msg
             << " with payload '" << msg.getDataAsString() << "'");

    consumer.acknowledge(msg);
}

client.close();
```

## Producer

```c++
Client client("pulsar://localhost:6650");

Producer producer;
Result result = client.createProducer("my-topic", producer);
if (result != ResultOk) {
    LOG_ERROR("Error creating producer: " << result);
    return -1;
}

// Publish 10 messages to the topic
for (int i = 0; i < 10; i++) {
    Message msg = MessageBuilder().setContent("my-message").build();
    Result res = producer.send(msg);
    LOG_INFO("Message sent: " << res);
}
client.close();
```

## Authentication

```cpp
ClientConfiguration config = ClientConfiguration();
config.setUseTls(true);
config.setTlsTrustCertsFilePath("/path/to/cacert.pem");
config.setTlsAllowInsecureConnection(false);
config.setAuth(pulsar::AuthTls::create(
    "/path/to/client-cert.pem", "/path/to/client-key.pem"));

Client client("pulsar+ssl://my-broker.com:6651", config);
```

diff --git a/site2/docs/client-libraries-go.md b/site2/docs/client-libraries-go.md new file mode 100644 index 0000000000000000000000000000000000000000..0f5283b8968830525d60ed21e12eeac9b9f756ac --- /dev/null +++ b/site2/docs/client-libraries-go.md @@ -0,0 +1,452 @@

---
id: client-libraries-go
title: The Pulsar Go client
sidebar_label: Go
---

The Pulsar Go client can be used to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Go (aka Golang).

> #### API docs available as well
> For standard API docs, consult the [Godoc](https://godoc.org/github.com/apache/incubator-pulsar/pulsar-client-go/pulsar).
## Installation

### Requirements

The Pulsar Go client library is based on the C++ client library. Follow the instructions for the [C++ library](client-libraries-cpp.md) to install the binaries through RPM, Deb, or Homebrew packages.

### Installing the Go package

You can install the `pulsar` library locally using `go get`:

```bash
$ go get -u github.com/apache/incubator-pulsar/pulsar-client-go/pulsar
```

Once installed locally, you can import it into your project:

```go
import "github.com/apache/incubator-pulsar/pulsar-client-go/pulsar"
```

## Connection URLs

To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL.

Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme, and have a default port of 6650. Here's an example for `localhost`:

```http
pulsar://localhost:6650
```

A URL for a production Pulsar cluster may look something like this:

```http
pulsar://pulsar.us-west.example.com:6650
```

If you're using [TLS](administration-auth.md#tls-client-auth) authentication, the URL will look something like this:

```http
pulsar+ssl://pulsar.us-west.example.com:6651
```

## Creating a client

In order to interact with Pulsar, you'll first need a `Client` object. You can create a client object using the `NewClient` function, passing in a `ClientOptions` object (more on configuration [below](#client-configuration)). Here's an example:

```go
import (
    "log"
    "runtime"

    "github.com/apache/incubator-pulsar/pulsar-client-go/pulsar"
)

func main() {
    client, err := pulsar.NewClient(pulsar.ClientOptions{
        URL:                     "pulsar://localhost:6650",
        OperationTimeoutSeconds: 5,
        MessageListenerThreads:  runtime.NumCPU(),
    })

    if err != nil {
        log.Fatalf("Could not instantiate Pulsar client: %v", err)
    }

    // Release the client's resources when done
    defer client.Close()
}
```

The following configurable parameters are available for Pulsar clients:

Parameter | Description | Default
:---------|:------------|:-------
`URL` | The connection URL for the Pulsar cluster. See [above](#connection-urls) for more info |
`IOThreads` | The number of threads to use for handling connections to Pulsar brokers | 1
`OperationTimeoutSeconds` | The timeout for some Go client operations (creating producers, subscribing to and unsubscribing from topics). Retries will occur until this threshold is reached, at which point the operation will fail. | 30
`MessageListenerThreads` | The number of threads used by message listeners ([consumers](#consumers) and [readers](#readers)) | 1
`ConcurrentLookupRequests` | The number of concurrent lookup requests that can be sent on each broker connection. Setting a maximum helps to keep from overloading brokers. You should set values over the default of 5000 only if the client needs to produce and/or subscribe to thousands of Pulsar topics. | 5000
`Logger` | A custom logger implementation for the client (as a function that takes a log level, file path, line number, and message). All info, warn, and error messages will be routed to this function. | `nil`
`TLSTrustCertsFilePath` | The file path for the trusted TLS certificate |
`TLSAllowInsecureConnection` | Whether the client accepts untrusted TLS certificates from the broker | `false`
`Authentication` | The authentication provider to use, for example: `Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem")` | `nil` (no authentication)
`StatsIntervalInSeconds` | The interval (in seconds) at which client stats are published | 60
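As an illustration of the TLS-related options in this table, a client configured for a TLS-secured cluster might look like the following sketch (the URL and certificate paths are hypothetical):

```go
client, err := pulsar.NewClient(pulsar.ClientOptions{
    URL:                        "pulsar+ssl://pulsar.us-west.example.com:6651",
    TLSTrustCertsFilePath:      "/path/to/cacert.pem",
    TLSAllowInsecureConnection: false,
    Authentication:             pulsar.NewAuthenticationTLS("/path/to/client-cert.pem", "/path/to/client-key.pem"),
})
if err != nil {
    log.Fatalf("Could not instantiate Pulsar client: %v", err)
}
```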
## Producers

Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Go producers using a `ProducerOptions` object. Here's an example:

```go
producer, err := client.CreateProducer(pulsar.ProducerOptions{
    Topic: "my-topic",
})

if err != nil {
    log.Fatalf("Could not instantiate Pulsar producer: %v", err)
}

defer producer.Close()

msg := pulsar.ProducerMessage{
    Payload: []byte("Hello, Pulsar"),
}

if err := producer.Send(context.Background(), msg); err != nil {
    log.Fatalf("Producer could not send message: %v", err)
}
```

> #### Blocking operation
> When you create a new Pulsar producer, the operation will block (waiting on a go channel) until either a producer is successfully created or an error is returned.

### Producer operations

Pulsar Go producers have the following methods available:

Method | Description | Return type
:------|:------------|:-----------
`Topic()` | Fetches the producer's topic | `string`
`Name()` | Fetches the producer's name | `string`
`Send(context.Context, ProducerMessage) error` | Publishes a [message](#messages) to the producer's topic. This call will block until the message is successfully acknowledged by the Pulsar broker, or an error will be returned if the timeout set using the `SendTimeout` in the producer's [configuration](#producer-configuration) is exceeded. | `error`
`SendAsync(context.Context, ProducerMessage, func(ProducerMessage, error))` | Publishes a [message](#messages) to the producer's topic asynchronously. The third argument is a callback function that specifies what happens either when the message is acknowledged or an error occurs. |
`Close()` | Closes the producer and releases all resources allocated to it. Once `Close()` is called, no more messages will be accepted from the publisher. This method will block until all pending publish requests have been persisted by Pulsar. If an error is returned, no pending writes will be retried. | `error`
Here's a more involved example usage of a producer:

```go
import (
    "context"
    "fmt"
    "log"

    "github.com/apache/incubator-pulsar/pulsar-client-go/pulsar"
)

func main() {
    // Instantiate a Pulsar client
    client, err := pulsar.NewClient(pulsar.ClientOptions{
        URL: "pulsar://localhost:6650",
    })

    if err != nil { log.Fatal(err) }

    // Use the client to instantiate a producer
    producer, err := client.CreateProducer(pulsar.ProducerOptions{
        Topic: "my-topic",
    })

    if err != nil { log.Fatal(err) }

    ctx := context.Background()

    // Send 10 messages synchronously and 10 messages asynchronously
    for i := 0; i < 10; i++ {
        // Create a message
        msg := pulsar.ProducerMessage{
            Payload: []byte(fmt.Sprintf("message-%d", i)),
        }

        // Attempt to send the message
        if err := producer.Send(ctx, msg); err != nil {
            log.Fatal(err)
        }

        // Create a different message to send asynchronously
        asyncMsg := pulsar.ProducerMessage{
            Payload: []byte(fmt.Sprintf("async-message-%d", i)),
        }

        // Attempt to send the message asynchronously and handle the response
        producer.SendAsync(ctx, asyncMsg, func(msg pulsar.ProducerMessage, err error) {
            if err != nil { log.Fatal(err) }

            fmt.Printf("Message %s successfully published", string(msg.Payload))
        })
    }
}
```

### Producer configuration

Parameter | Description | Default
:---------|:------------|:-------
`Topic` | The Pulsar topic to which the producer will publish messages |
`Name` | A name for the producer. If you don't explicitly assign a name, Pulsar will automatically generate a globally unique name that you can access later using the `Name()` method. If you choose to explicitly assign a name, it will need to be unique across *all* Pulsar clusters, otherwise the creation operation will throw an error. |
`SendTimeout` | When publishing a message to a topic, the producer will wait for an acknowledgment from the responsible Pulsar broker. If a message is not acknowledged within the threshold set by this parameter, an error will be returned. If you set `SendTimeout` to -1, the timeout will be set to infinity (and thus removed). Removing the send timeout is recommended when using Pulsar's [message de-duplication](cookbooks-deduplication.md) feature. | 30 seconds
`MaxPendingMessages` | The maximum size of the queue holding pending messages (i.e. messages waiting to receive an acknowledgment from the broker). By default, when the queue is full all calls to the `Send` and `SendAsync` methods will fail *unless* `BlockIfQueueFull` is set to `true`. |
`MaxPendingMessagesAcrossPartitions` | |
`BlockIfQueueFull` | If set to `true`, the producer's `Send` and `SendAsync` methods will block when the outgoing message queue is full rather than failing and throwing an error (the size of that queue is dictated by the `MaxPendingMessages` parameter); if set to `false` (the default), `Send` and `SendAsync` operations will fail and throw a `ProducerQueueIsFullError` when the queue is full. | `false`
`MessageRoutingMode` | The message routing logic (for producers on [partitioned topics](getting-started-concepts-and-architecture.md#partitioned-topics)). This logic is applied only when no key is set on messages. The available options are: round robin (`pulsar.RoundRobinDistribution`, the default), publishing all messages to a single partition (`pulsar.UseSinglePartition`), or a custom partitioning scheme (`pulsar.CustomPartition`). | `pulsar.RoundRobinDistribution`
`HashingScheme` | The hashing function that determines the partition on which a particular message is published (partitioned topics only). The available options are: `pulsar.JavaStringHash` (the equivalent of `String.hashCode()` in Java), `pulsar.Murmur3_32Hash` (applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function), or `pulsar.BoostHash` (applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library) | `pulsar.JavaStringHash`
`CompressionType` | The message data compression type used by the producer. The available options are [`LZ4`](https://github.com/lz4/lz4) and [`ZLIB`](https://zlib.net/). | No compression
`MessageRouter` | By default, Pulsar uses a round-robin routing scheme for [partitioned topics](cookbooks-partitioned.md). The `MessageRouter` parameter enables you to specify custom routing logic via a function that takes the Pulsar message and topic metadata as an argument and returns an integer (the partition index), i.e. a function signature of `func(Message, TopicMetadata) int`. |

## Consumers

Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Go consumers using a `ConsumerOptions` object. Here's a basic example that uses channels:

```go
msgChannel := make(chan pulsar.ConsumerMessage)

consumerOpts := pulsar.ConsumerOptions{
    Topic:            "my-topic",
    SubscriptionName: "my-subscription-1",
    Type:             pulsar.Exclusive,
    MessageChannel:   msgChannel,
}

consumer, err := client.Subscribe(consumerOpts)

if err != nil {
    log.Fatalf("Could not establish subscription: %v", err)
}

defer consumer.Close()

for cm := range msgChannel {
    msg := cm.Message

    fmt.Printf("Message ID: %s", msg.ID())
    fmt.Printf("Message value: %s", string(msg.Payload()))

    consumer.Ack(msg)
}
```

> #### Blocking operation
> When you create a new Pulsar consumer, the operation will block (on a go channel) until either the consumer is successfully created or an error is returned.

### Consumer operations

Pulsar Go consumers have the following methods available:

Method | Description | Return type
:------|:------------|:-----------
`Topic()` | Returns the consumer's topic | `string`
`Subscription()` | Returns the consumer's subscription name | `string`
`Unsubscribe()` | Unsubscribes the consumer from the assigned topic. Returns an error if the unsubscribe operation is somehow unsuccessful. | `error`
`Receive(context.Context)` | Receives a single message from the topic. This method blocks until a message is available. | `(Message, error)`
`Ack(Message)` | Acknowledges a message to the Pulsar broker | `error`
`AckID(MessageID)` | Acknowledges a message to the Pulsar broker by message ID | `error`
`AckCumulative(Message)` | Acknowledges *all* the messages in the stream, up to and including the specified message. The `AckCumulative` method will block until the ack has been sent to the broker. After that, the messages will *not* be redelivered to the consumer. Cumulative acking cannot be used with a [shared](getting-started-concepts-and-architecture.md#shared) subscription type. | `error`
`Close()` | Closes the consumer, disabling its ability to receive messages from the broker | `error`
`RedeliverUnackedMessages()` | Redelivers *all* unacknowledged messages on the topic. In [failover](getting-started-concepts-and-architecture.md#failover) mode, this request is ignored if the consumer isn't active on the specified topic; in [shared](getting-started-concepts-and-architecture.md#shared) mode, redelivered messages are distributed across all consumers connected to the topic. **Note**: this is a *non-blocking* operation that doesn't return an error. |

### Consumer operations

Pulsar Go consumers have the following methods available:

Method | Description | Return type
:------|:------------|:-----------
`Topic()` | Returns the consumer's {% popover topic %} | `string`
`Subscription()` | Returns the consumer's subscription name | `string`
`Unsubscribe()` | Unsubscribes the consumer from the assigned topic. Throws an error if the unsubscribe operation is somehow unsuccessful. | `error`
`Receive(context.Context)` | Receives a single message from the topic. This method blocks until a message is available. | `(Message, error)`
`Ack(Message)` | {% popover Acknowledges %} a message to the Pulsar {% popover broker %} | `error`
`AckID(MessageID)` | {% popover Acknowledges %} a message to the Pulsar {% popover broker %} by message ID | `error`
`AckCumulative(Message)` | {% popover Acknowledges %} *all* the messages in the stream, up to and including the specified message. The `AckCumulative` method will block until the ack has been sent to the broker. After that, the messages will *not* be redelivered to the consumer. Cumulative acking cannot be used with a [shared](getting-started-concepts-and-architecture.md#shared) subscription type. | `error`
`Close()` | Closes the consumer, disabling its ability to receive messages from the broker | `error`
`RedeliverUnackedMessages()` | Redelivers *all* unacknowledged messages on the topic. In [failover](getting-started-concepts-and-architecture.md#failover) mode, this request is ignored if the consumer isn't active on the specified topic; in [shared](getting-started-concepts-and-architecture.md#shared) mode, redelivered messages are distributed across all consumers connected to the topic. **Note**: this is a *non-blocking* operation that doesn't throw an error. |

#### Receive example

Here's an example usage of a Go consumer that uses the `Receive()` method to process incoming messages:

```go
import (
    "context"
    "log"

    "github.com/apache/incubator-pulsar/pulsar-client-go/pulsar"
)

func main() {
    // Instantiate a Pulsar client
    client, err := pulsar.NewClient(pulsar.ClientOptions{
        URL: "pulsar://localhost:6650",
    })

    if err != nil { log.Fatal(err) }

    // Use the client object to instantiate a consumer
    consumer, err := client.Subscribe(pulsar.ConsumerOptions{
        Topic:            "my-golang-topic",
        SubscriptionName: "sub-1",
        SubscriptionType: pulsar.Exclusive,
    })

    if err != nil { log.Fatal(err) }

    defer consumer.Close()

    ctx := context.Background()

    // Listen indefinitely on the topic
    for {
        msg, err := consumer.Receive(ctx)
        if err != nil { log.Fatal(err) }

        // Do something with the message

        consumer.Ack(msg)
    }
}
```

### Consumer configuration

Parameter | Description | Default
:---------|:------------|:-------
`Topic` | The Pulsar {% popover topic %} on which the consumer will establish a subscription and listen for messages |
`SubscriptionName` | The subscription name for this consumer |
`Name` | The name of the consumer |
`AckTimeout` | The timeout for unacknowledged messages. A value of 0 (the default) disables the ack timeout. | 0
`SubscriptionType` | Available options are `Exclusive`, `Shared`, and `Failover` | `Exclusive`
`MessageChannel` | The Go channel used by the consumer. Messages that arrive from the Pulsar topic(s) will be passed to this channel. |
`ReceiverQueueSize` | Sets the size of the consumer's receiver queue, i.e. the number of messages that can be accumulated by the consumer before the application calls `Receive`. A value higher than the default of 1000 could increase consumer throughput, though at the expense of more memory utilization. | 1000
`MaxTotalReceiverQueueSizeAcrossPartitions` | Sets the max total receiver queue size across partitions. This setting will be used to reduce the receiver queue size for individual partitions if the total exceeds this value. | 50000

## Readers

Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recent unacked message). You can [configure](#reader-configuration) Go readers using a `ReaderOptions` object. Here's an example:

```go
reader, err := client.CreateReader(pulsar.ReaderOptions{
    Topic:          "my-golang-topic",
    StartMessageID: pulsar.LatestMessage,
})
```

> #### Blocking operation
> When you create a new Pulsar reader, the operation will block (on a go channel) until either a reader is successfully created or an error is thrown.

### Reader operations

Pulsar Go readers have the following methods available:

Method | Description | Return type
:------|:------------|:-----------
`Topic()` | Returns the reader's {% popover topic %} | `string`
`Next(context.Context)` | Receives the next message on the topic (analogous to the `Receive` method for [consumers](#consumer-operations)). This method blocks until a message is available. | `(Message, error)`
`Close()` | Closes the reader, disabling its ability to receive messages from the broker | `error`

#### "Next" example

Here's an example usage of a Go reader that uses the `Next()` method to process incoming messages:

```go
import (
    "context"
    "log"

    "github.com/apache/incubator-pulsar/pulsar-client-go/pulsar"
)

func main() {
    // Instantiate a Pulsar client
    client, err := pulsar.NewClient(pulsar.ClientOptions{
        URL: "pulsar://localhost:6650",
    })

    if err != nil { log.Fatalf("Could not create client: %v", err) }

    // Use the client to instantiate a reader
    reader, err := client.CreateReader(pulsar.ReaderOptions{
        Topic:          "my-golang-topic",
        StartMessageID: pulsar.EarliestMessage,
    })

    if err != nil { log.Fatalf("Could not create reader: %v", err) }

    defer reader.Close()

    ctx := context.Background()

    // Listen on the topic for incoming messages
    for {
        msg, err := reader.Next(ctx)
        if err != nil { log.Fatalf("Error reading from topic: %v", err) }

        // Process the message
    }
}
```

In the example above, the reader begins reading from the earliest available message (specified by `pulsar.EarliestMessage`). The reader can also begin reading from the latest message (`pulsar.LatestMessage`) or some other message ID specified by bytes using the `DeserializeMessageID` function, which takes a byte array and returns a `MessageID` object. Here's an example:

```go
// Read the last saved message ID from an external store as a byte slice
var lastSavedId []byte

reader, err := client.CreateReader(pulsar.ReaderOptions{
    Topic:          "my-golang-topic",
    StartMessageID: pulsar.DeserializeMessageID(lastSavedId),
})
```
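
To round-trip reader positions like this, a message's ID needs to be serialized back to bytes when it is saved. Here's a hedged sketch of that pattern; it assumes a `Serialize()` method on `MessageID` (as the counterpart of `DeserializeMessageID`) and a hypothetical `saveToStore` helper:

```go
for {
    msg, err := reader.Next(ctx)
    if err != nil {
        log.Fatal(err)
    }

    // Process the message, then persist its position so a future
    // reader can resume from here via DeserializeMessageID
    saveToStore(msg.ID().Serialize()) // saveToStore is a hypothetical helper
}
```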

### Reader configuration

Parameter | Description | Default
:---------|:------------|:-------
`Topic` | The Pulsar {% popover topic %} on which the reader will establish a subscription and listen for messages |
`Name` | The name of the reader |
`StartMessageID` | The initial reader position, i.e. the message at which the reader begins processing messages. The options are `pulsar.EarliestMessage` (the earliest available message on the topic), `pulsar.LatestMessage` (the latest available message on the topic), or a `MessageID` object for a position that isn't earliest or latest. |
`MessageChannel` | The Go channel used by the reader. Messages that arrive from the Pulsar topic(s) will be passed to this channel. |
`ReceiverQueueSize` | Sets the size of the reader's receiver queue, i.e. the number of messages that can be accumulated by the reader before the application calls `Next`. A value higher than the default of 1000 could increase reader throughput, though at the expense of more memory utilization. | 1000
`SubscriptionRolePrefix` | The subscription role prefix. | `reader`

## Messages

The Pulsar Go client provides a `ProducerMessage` interface that you can use to construct messages to produce on Pulsar topics. Here's an example message:

```go
msg := pulsar.ProducerMessage{
    Payload: []byte("Here is some message data"),
    Key: "message-key",
    Properties: map[string]string{
        "foo": "bar",
    },
    EventTime: time.Now(),
    ReplicationClusters: []string{"cluster1", "cluster3"},
}

if err := producer.Send(context.Background(), msg); err != nil {
    log.Fatalf("Could not publish message due to: %v", err)
}
```

The following parameters are available for `ProducerMessage` objects:

Parameter | Description
:---------|:-----------
`Payload` | The actual data payload of the message
`Key` | The optional key associated with the message (particularly useful for things like topic compaction)
`Properties` | A key-value map (both keys and values must be strings) for any application-specific metadata attached to the message
`EventTime` | The timestamp associated with the message
`ReplicationClusters` | The clusters to which this message will be replicated. Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default.

## TLS encryption and authentication

In order to use [TLS encryption](administration-auth.md), you'll need to configure your client to do so:

 * Use the `pulsar+ssl` URL type
 * Set `TLSTrustCertsFilePath` to the path to the TLS certs used by your client and the Pulsar broker
 * Configure the `Authentication` option

Here's an example:

```go
opts := pulsar.ClientOptions{
    URL: "pulsar+ssl://my-cluster.com:6651",
    TLSTrustCertsFilePath: "/path/to/certs/my-cert.csr",
    Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem"),
}
```
diff --git a/site2/docs/client-libraries-java.md b/site2/docs/client-libraries-java.md
new file mode 100644
index 0000000000000000000000000000000000000000..9fe37849fb0d5c903e0d8568e32a85475af3f802
--- /dev/null
+++ b/site2/docs/client-libraries-java.md
@@ -0,0 +1,468 @@
---
id: client-libraries-java
title: The Pulsar Java client
sidebar_label: Java
---

The Pulsar Java client can be used both to create Java producers, consumers, and [readers](#readers) of messages and to perform [administrative tasks](admin-api-overview.md). The current version of the Java client is **pulsar:version**.

Javadoc for the Pulsar client is divided into two domains, by package:

Package | Description | Maven Artifact
:-------|:------------|:--------------
[`org.apache.pulsar.client.api`](/api/client) | The producer and consumer API | [org.apache.pulsar:pulsar-client:pulsar:version](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7Cpulsar:version%7Cjar)
[`org.apache.pulsar.client.admin`](/api/admin) | The Java [admin API](admin-api-overview.md) | [org.apache.pulsar:pulsar-client-admin:pulsar:version](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client-admin%7Cpulsar:version%7Cjar)

This document will focus only on the client API for producing and consuming messages on Pulsar topics. For a guide to using the Java admin client, see [The Pulsar admin interface](admin-api-overview.md).

## Installation

The latest version of the Pulsar Java client library is available via [Maven Central](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7Cpulsar:version%7Cjar). To use the latest version, add the `pulsar-client` library to your build configuration.

### Maven

If you're using Maven, add this to your `pom.xml`:

```xml
<!-- in your <properties> block -->
<pulsar.version>pulsar:version</pulsar.version>

<!-- in your <dependencies> block -->
<dependency>
  <groupId>org.apache.pulsar</groupId>
  <artifactId>pulsar-client</artifactId>
  <version>${pulsar.version}</version>
</dependency>
```

### Gradle

If you're using Gradle, add this to your `build.gradle` file:

```groovy
def pulsarVersion = 'pulsar:version'

dependencies {
    compile group: 'org.apache.pulsar', name: 'pulsar-client', version: pulsarVersion
}
```

## Connection URLs

To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL.

Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here's an example for `localhost`:

```http
pulsar://localhost:6650
```

A URL for a production Pulsar cluster may look something like this:

```http
pulsar://pulsar.us-west.example.com:6650
```

If you're using [TLS](administration-auth.md#tls-client-auth) authentication, the URL will look something like this:

```http
pulsar+ssl://pulsar.us-west.example.com:6651
```

## Client configuration

You can instantiate a {% javadoc PulsarClient client org.apache.pulsar.client.api.PulsarClient %} object using just a URL for the target Pulsar {% popover cluster %}, like this:

```java
PulsarClient client = PulsarClient.builder()
    .serviceUrl("pulsar://localhost:6650")
    .build();
```

> #### Default broker URLs for standalone clusters
> If you're running a cluster in [standalone mode](getting-started-standalone.md), the broker will be available at the `pulsar://localhost:6650` URL by default.

Check out the Javadoc for the {% javadoc PulsarClient client org.apache.pulsar.client.api.PulsarClient %} class for a full listing of configurable parameters.

> In addition to client-level configuration, you can also apply [producer](#configuring-producers) and [consumer](#configuring-consumers) specific configuration, as you'll see in the sections below.


## Producers

In Pulsar, producers write messages to topics. Once you've instantiated a {% javadoc PulsarClient client org.apache.pulsar.client.api.PulsarClient %} object (as in the section [above](#client-configuration)), you can create a {% javadoc Producer client org.apache.pulsar.client.api.Producer %} for a specific Pulsar {% popover topic %}.

```java
Producer<byte[]> producer = client.newProducer()
    .topic("my-topic")
    .create();

// You can then send messages to the broker and topic you specified:
producer.send("My message".getBytes());
```

By default, producers produce messages that consist of byte arrays. You can produce different types, however, by specifying a message [schema](#schemas).

```java
Producer<String> stringProducer = client.newProducer(Schema.STRING)
    .topic("my-topic")
    .create();
stringProducer.send("My message");
```

> You should always make sure to close your producers, consumers, and clients when they are no longer needed:
> ```java
> producer.close();
> consumer.close();
> client.close();
> ```
>
> Close operations can also be asynchronous:
> ```java
> producer.closeAsync()
>    .thenRun(() -> System.out.println("Producer closed"))
>    .exceptionally((ex) -> {
>        System.err.println("Failed to close producer: " + ex);
>        return ex;
>    });
> ```
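
Creation can likewise be asynchronous. Here's a hedged sketch (not from the original docs) using the builder's `createAsync()` method, which returns a `CompletableFuture`:

```java
client.newProducer()
    .topic("my-topic")
    .createAsync()
    .thenAccept(producer -> System.out.println("Producer created: " + producer.getProducerName()))
    .exceptionally(ex -> {
        System.err.println("Failed to create producer: " + ex);
        return null;
    });
```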

### Configuring producers

If you instantiate a `Producer` object specifying only a topic name, as in the example above, the producer will use the default configuration. To use a non-default configuration, there's a variety of configurable parameters that you can set. For a full listing, see the Javadoc for the {% javadoc ProducerBuilder client org.apache.pulsar.client.api.ProducerBuilder %} class. Here's an example:

```java
Producer<byte[]> producer = client.newProducer()
    .topic("my-topic")
    .batchingMaxPublishDelay(10, TimeUnit.MILLISECONDS)
    .sendTimeout(10, TimeUnit.SECONDS)
    .blockIfQueueFull(true)
    .create();
```

### Message routing

When using partitioned topics, you can specify the routing mode whenever you publish messages using a producer. For more on specifying a routing mode using the Java client, see the [Partitioned Topics](cookbooks-partitioned.md) cookbook.

### Async send

You can also publish messages [asynchronously](getting-started-concepts-and-architecture.md#send-modes) using the Java client. With async send, the producer will put the message in a blocking queue and return immediately. The client library will then send the message to the broker in the background. If the queue is full (max size configurable), the producer could be blocked or fail immediately when calling the API, depending on the arguments passed to the producer.

Here's an example async send operation:

```java
producer.sendAsync("my-async-message".getBytes()).thenAccept(msgId -> {
    System.out.printf("Message with ID %s successfully sent", msgId);
});
```

As you can see from the example above, async send operations return a {% javadoc MessageId client org.apache.pulsar.client.api.MessageId %} wrapped in a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture).

### Configuring messages

In addition to a value, it's possible to set additional items on a given message:

```java
producer.newMessage()
    .key("my-message-key")
    .value("my-async-message".getBytes())
    .property("my-key", "my-value")
    .property("my-other-key", "my-other-value")
    .send();
```

As in the previous case, it's also possible to terminate the builder chain with `sendAsync()` and get a future returned.

## Consumers

In Pulsar, consumers subscribe to topics and handle messages that producers publish to those topics. You can instantiate a new {% popover consumer %} by first instantiating a {% javadoc PulsarClient client org.apache.pulsar.client.api.PulsarClient %} object and passing it a URL for a Pulsar broker (as [above](#client-configuration)).

Once you've instantiated a {% javadoc PulsarClient client org.apache.pulsar.client.api.PulsarClient %} object, you can create a {% javadoc Consumer client org.apache.pulsar.client.api.Consumer %} by specifying a {% popover topic %} and a [subscription](getting-started-concepts-and-architecture.md#subscription-modes).

```java
Consumer<byte[]> consumer = client.newConsumer()
    .topic("my-topic")
    .subscriptionName("my-subscription")
    .subscribe();
```

The `subscribe` method will automatically subscribe the consumer to the specified topic and subscription. One way to make the consumer listen on the topic is to set up a `while` loop.
In this example loop, the consumer listens for messages, prints the contents of any message that's received, and then {% popover acknowledges %} that the message has been processed:

```java
do {
    // Wait for a message
    Message<byte[]> msg = consumer.receive();

    System.out.printf("Message received: %s", new String(msg.getData()));

    // Acknowledge the message so that it can be deleted by the message broker
    consumer.acknowledge(msg);
} while (true);
```

### Configuring consumers

If you instantiate a `Consumer` object specifying only a topic and subscription name, as in the example above, the consumer will use the default configuration. To use a non-default configuration, there's a variety of configurable parameters that you can set. For a full listing, see the Javadoc for the {% javadoc ConsumerBuilder client org.apache.pulsar.client.api.ConsumerBuilder %} class. Here's an example configuration:

```java
Consumer<byte[]> consumer = client.newConsumer()
    .topic("my-topic")
    .subscriptionName("my-subscription")
    .ackTimeout(10, TimeUnit.SECONDS)
    .subscriptionType(SubscriptionType.Exclusive)
    .subscribe();
```

### Async receive

The `receive` method will receive messages synchronously (the consumer process will be blocked until a message is available). You can also use [async receive](getting-started-concepts-and-architecture.md#receive-modes), which will return immediately with a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) object that completes once a new message is available.

Here's an example:

```java
CompletableFuture<Message<byte[]>> asyncMessage = consumer.receiveAsync();
```

Async receive operations return a {% javadoc Message client org.apache.pulsar.client.api.Message %} wrapped inside of a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture).
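
For instance, here's a minimal sketch (not from the original docs) that chains a handler onto the returned future and acknowledges the message, assuming the `consumer` from above:

```java
consumer.receiveAsync().thenAccept(msg -> {
    System.out.printf("Message received asynchronously: %s%n", new String(msg.getData()));
    try {
        // Acknowledge from inside the callback once processing is done
        consumer.acknowledge(msg);
    } catch (PulsarClientException e) {
        e.printStackTrace();
    }
});
```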

### Multi-topic subscriptions

In addition to subscribing a consumer to a single Pulsar topic, you can also subscribe to multiple topics simultaneously using [multi-topic subscriptions](getting-started-concepts-and-architecture.md#multi-topic-subscriptions). To use multi-topic subscriptions you can supply either a regular expression (regex) or a `List` of topics. If you select topics via regex, all topics must be within the same Pulsar namespace.

Here are some examples:

```java
import org.apache.pulsar.client.api.Consumer;
import org.apache.pulsar.client.api.PulsarClient;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

ConsumerBuilder<byte[]> consumerBuilder = pulsarClient.newConsumer()
    .subscriptionName(subscription);

// Subscribe to all topics in a namespace
Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default/.*");
Consumer<byte[]> allTopicsConsumer = consumerBuilder
    .topicsPattern(allTopicsInNamespace)
    .subscribe();

// Subscribe to a subset of topics in a namespace, based on regex
Pattern someTopicsInNamespace = Pattern.compile("persistent://public/default/foo.*");
Consumer<byte[]> someTopicsConsumer = consumerBuilder
    .topicsPattern(someTopicsInNamespace)
    .subscribe();
```

You can also subscribe to an explicit list of topics (across namespaces if you wish):

```java
List<String> topics = Arrays.asList(
    "topic-1",
    "topic-2",
    "topic-3"
);

Consumer<byte[]> multiTopicConsumer = consumerBuilder
    .topics(topics)
    .subscribe();

// Alternatively:
Consumer<byte[]> multiTopicConsumer = consumerBuilder
    .topics(
        "topic-1",
        "topic-2",
        "topic-3"
    )
    .subscribe();
```

You can also subscribe to multiple topics asynchronously using the `subscribeAsync` method rather than the synchronous `subscribe` method. Here's an example:

```java
Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default.*");
consumerBuilder
    .topicsPattern(allTopicsInNamespace)
    .subscribeAsync()
    .thenAccept(consumer -> {
        do {
            try {
                Message<byte[]> msg = consumer.receive();
                // Do something with the received message
            } catch (PulsarClientException e) {
                e.printStackTrace();
            }
        } while (true);
    });
```

## Reader interface {#readers}

With the [reader interface](getting-started-concepts-and-architecture.md#reader-interface), Pulsar clients can "manually position" themselves within a topic, reading all messages from a specified message onward. The Pulsar API for Java enables you to create {% javadoc Reader client org.apache.pulsar.client.api.Reader %} objects by specifying a {% popover topic %} and a {% javadoc MessageId client org.apache.pulsar.client.api.MessageId %} to start reading from.

Here's an example:

```java
byte[] msgIdBytes = // Some message ID byte array
MessageId id = MessageId.fromByteArray(msgIdBytes);
Reader<byte[]> reader = pulsarClient.newReader()
    .topic(topic)
    .startMessageId(id)
    .create();

while (true) {
    Message<byte[]> message = reader.readNext();
    // Process message
}
```

In the example above, a `Reader` object is instantiated for a specific topic and message (by ID); the reader then iterates over each message in the topic after the message identified by `msgIdBytes` (how that value is obtained depends on the application).

The code sample above shows pointing the `Reader` object to a specific message (by ID), but you can also use `MessageId.earliest` to point to the earliest available message on the topic or `MessageId.latest` to point to the most recent available message.
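
For instance, a quick sketch of a reader that replays a topic from the very beginning (assuming the `pulsarClient` and `topic` variables from the example above):

```java
// Start reading from the earliest available message on the topic
Reader<byte[]> replayReader = pulsarClient.newReader()
    .topic(topic)
    .startMessageId(MessageId.earliest)
    .create();
```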

## Schemas

In Pulsar, all message data consists of byte arrays "under the hood." [Message schemas](getting-started-concepts-and-architecture.md#schema-registry) enable you to use other types of data when constructing and handling messages (from simple types like strings to more complex, application-specific types). If you construct, say, a [producer](#producers) without specifying a schema, then the producer can only produce messages of type `byte[]`. Here's an example:

```java
Producer<byte[]> producer = client.newProducer()
    .topic(topic)
    .create();
```

The producer above is equivalent to a `Producer<byte[]>` (in fact, you should *always* explicitly specify the type). If you'd like to use a producer for a different type of data, you'll need to specify a **schema** that informs Pulsar which data type will be transmitted over the {% popover topic %}.

### Schema example

Let's say that you have a `SensorReading` class that you'd like to transmit over a Pulsar topic:

```java
public class SensorReading {
    public float temperature;

    public SensorReading(float temperature) {
        this.temperature = temperature;
    }

    // A no-arg constructor is required
    public SensorReading() {
    }

    public float getTemperature() {
        return temperature;
    }

    public void setTemperature(float temperature) {
        this.temperature = temperature;
    }
}
```

You could then create a `Producer<SensorReading>` (or `Consumer<SensorReading>`) like so:

```java
Producer<SensorReading> producer = client.newProducer(JSONSchema.of(SensorReading.class))
    .topic("sensor-readings")
    .create();
```

The following schema formats are currently available for Java:

* No schema or the byte array schema (which can be applied using `Schema.BYTES`):

  ```java
  Producer<byte[]> bytesProducer = client.newProducer(Schema.BYTES)
      .topic("some-raw-bytes-topic")
      .create();
  ```

  Or, equivalently:

  ```java
  Producer<byte[]> bytesProducer = client.newProducer()
      .topic("some-raw-bytes-topic")
      .create();
  ```

* `String` for normal UTF-8-encoded string data. This schema can be applied using `Schema.STRING`:

  ```java
  Producer<String> stringProducer = client.newProducer(Schema.STRING)
      .topic("some-string-topic")
      .create();
  ```
* JSON schemas can be created for POJOs using the `JSONSchema` class. Here's an example:

  ```java
  Schema<MyPojo> pojoSchema = JSONSchema.of(MyPojo.class);
  Producer<MyPojo> pojoProducer = client.newProducer(pojoSchema)
      .topic("some-pojo-topic")
      .create();
  ```

## Authentication

Pulsar currently supports two authentication schemes: [TLS](administration-auth.md#tls-client-auth) and [Athenz](administration-auth.md#athenz). The Pulsar Java client can be used with both.

### TLS Authentication

To use [TLS](administration-auth.md#tls-client-auth), you need to enable TLS using the `enableTls` method, point your Pulsar client to a TLS cert path, and provide paths to cert and key files.

Here's an example configuration:

```java
Map<String, String> authParams = new HashMap<>();
authParams.put("tlsCertFile", "/path/to/client-cert.pem");
authParams.put("tlsKeyFile", "/path/to/client-key.pem");

Authentication tlsAuth = AuthenticationFactory
    .create(AuthenticationTls.class.getName(), authParams);

PulsarClient client = PulsarClient.builder()
    .serviceUrl("pulsar+ssl://my-broker.com:6651")
    .enableTls(true)
    .tlsTrustCertsFilePath("/path/to/cacert.pem")
    .authentication(tlsAuth)
    .build();
```

### Athenz

To use [Athenz](administration-auth.md#athenz) as an authentication provider, you need to [use TLS](#tls-authentication) and provide values for four parameters in a hash:

* `tenantDomain`
* `tenantService`
* `providerDomain`
* `privateKey`

You can also set an optional `keyId`.
Here's an example configuration:

```java
Map<String, String> authParams = new HashMap<>();
authParams.put("tenantDomain", "shopping"); // Tenant domain name
authParams.put("tenantService", "some_app"); // Tenant service name
authParams.put("providerDomain", "pulsar"); // Provider domain name
authParams.put("privateKey", "file:///path/to/private.pem"); // Tenant private key path
authParams.put("keyId", "v1"); // Key id for the tenant private key (optional, default: "0")

Authentication athenzAuth = AuthenticationFactory
    .create(AuthenticationAthenz.class.getName(), authParams);

PulsarClient client = PulsarClient.builder()
    .serviceUrl("pulsar+ssl://my-broker.com:6651")
    .enableTls(true)
    .tlsTrustCertsFilePath("/path/to/cacert.pem")
    .authentication(athenzAuth)
    .build();
```

> #### Supported pattern formats
> The `privateKey` parameter supports the following three pattern formats:
> * `file:///path/to/file`
> * `file:/path/to/file`
> * `data:application/x-pem-file;base64,<base64-encoded key>`
diff --git a/site2/docs/client-libraries-python.md b/site2/docs/client-libraries-python.md
new file mode 100644
index 0000000000000000000000000000000000000000..ac4f1e2734d1ca48c4fe062b5851380e8456f075
--- /dev/null
+++ b/site2/docs/client-libraries-python.md
@@ -0,0 +1,94 @@
---
id: client-libraries-python
title: The Pulsar Python client
sidebar_label: Python
---

The Pulsar Python client library is a wrapper over the existing [C++ client library](client-libraries-cpp.md) and exposes all of the [same features](/api/cpp). You can find the code in the [`python` subdirectory](https://github.com/apache/incubator-pulsar/tree/master/pulsar-client-cpp/python) of the C++ client code.

## Installation

You can install the [`pulsar-client`](https://pypi.python.org/pypi/pulsar-client) library either via [PyPI](https://pypi.python.org/pypi), using [pip](#installation-using-pip), or by building the library from source.

### Installation using pip

To install the `pulsar-client` library as a pre-built package using the [pip](https://pip.pypa.io/en/stable/) package manager:

```shell
$ pip install pulsar-client --upgrade
```

Installation via PyPI is available for the following Python versions:

Platform | Supported Python versions
:--------|:-------------------------
MacOS 10.12 (Sierra) and 10.13 (High Sierra) | 2.7, 3.6
Linux | 2.7, 3.3, 3.4, 3.5, 3.6

### Installing from source

To install the `pulsar-client` library by building from source, follow [these instructions](client-libraries-cpp.md#compilation) and compile the Pulsar C++ client library. That will also build the Python binding for the library.

To install the built Python bindings:

```shell
$ git clone https://github.com/apache/pulsar
$ cd pulsar/pulsar-client-cpp/python
$ sudo python setup.py install
```

## API Reference

The complete Python API reference is available at [api/python]({{site.baseUrl}}/api/python).

## Examples

Below you'll find a variety of Python code examples for the `pulsar-client` library.

### Producer example

The following example creates a Python producer for the `my-topic` topic and sends 10 messages on that topic:

```python
import pulsar

client = pulsar.Client('pulsar://localhost:6650')

producer = client.create_producer('my-topic')

for i in range(10):
    producer.send(('Hello-%d' % i).encode('utf-8'))

client.close()
```

### Consumer example

The following example creates a consumer with the `my-subscription` subscription on the `my-topic` topic, listens for incoming messages, prints the content and ID of messages that arrive, and acknowledges each message to the Pulsar broker:

```python
client = pulsar.Client('pulsar://localhost:6650')

consumer = client.subscribe('my-topic', 'my-subscription')

while True:
    msg = consumer.receive()
    print("Received message '{}' id='{}'".format(msg.data(), msg.message_id()))
    consumer.acknowledge(msg)

client.close()
```

### Reader interface example

You can use the Pulsar Python API to use the Pulsar [reader interface](getting-started-concepts-and-architecture.md#reader-interface). Here's an example:

```python
# MessageId taken from a previously fetched message
msg_id = msg.message_id()

reader = client.create_reader('my-topic', msg_id)

while True:
    msg = reader.read_next()
    print("Received message '{}' id='{}'".format(msg.data(), msg.message_id()))
    # No acknowledgment
```
diff --git a/site2/docs/client-libraries-websocket.md b/site2/docs/client-libraries-websocket.md
new file mode 100644
index 0000000000000000000000000000000000000000..c71dfca56ee1abe5ba036249c0b6e8ed57cc79fd
--- /dev/null
+++ b/site2/docs/client-libraries-websocket.md
@@ -0,0 +1,409 @@
---
id: client-libraries-websocket
title: Pulsar's WebSocket API
sidebar_label: WebSocket API
---

Pulsar's [WebSocket](https://developer.mozilla.org/en-US/docs/Web/API/WebSockets_API) API is meant to provide a simple way to interact with Pulsar using languages that do not have an official [client library](getting-started-clients.md). Through WebSockets you can publish and consume messages and use all the features available in the [Java](client-libraries-java.md), [Python](client-libraries-python.md), and [C++](client-libraries-cpp.md) client libraries.


> You can use Pulsar's WebSocket API with any WebSocket client library. See examples for Python and Node.js [below](#client-examples).

## Running the WebSocket service

The standalone variant of Pulsar that we recommend using for [local development](getting-started-standalone.md) already has the WebSocket service enabled.

In non-standalone mode, there are two ways to deploy the WebSocket service:

* [embedded](#embedded-with-a-pulsar-broker) with a Pulsar broker
* as a [separate component](#as-a-separate-component)

### Embedded with a Pulsar broker

In this mode, the WebSocket service will run within the same HTTP service that's already running in the broker. To enable this mode, set the [`webSocketServiceEnabled`](reference-configuration.md#broker-webSocketServiceEnabled) parameter in the [`conf/broker.conf`](reference-configuration.md#broker) configuration file in your installation.

```properties
webSocketServiceEnabled=true
```

### As a separate component

In this mode, the WebSocket service runs as its own, separate service rather than within a Pulsar {% popover broker %}. Configuration for this mode is handled in the [`conf/websocket.conf`](reference-configuration.md#websocket) configuration file.
You'll need to set *at least* the following parameters:

* [`globalZookeeperServers`](reference-configuration.md#websocket-globalZookeeperServers)
* [`webServicePort`](reference-configuration.md#websocket-webServicePort)
* [`clusterName`](reference-configuration.md#websocket-clusterName)

Here's an example:

```properties
globalZookeeperServers=zk1:2181,zk2:2181,zk3:2181
webServicePort=8080
clusterName=my-cluster
```

### Starting the WebSocket service

When the configuration is set, you can start the service using the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) tool:

```shell
$ bin/pulsar-daemon start websocket
```

## API Reference

Pulsar's WebSocket API offers three endpoints for [producing](#producer-endpoint) messages, [consuming](#consumer-endpoint) messages and [reading](#reader-endpoint) messages.

All exchanges via the WebSocket API use JSON.

### Producer endpoint

The producer endpoint requires you to specify a tenant, namespace, and topic in the URL:

```http
ws://broker-service-url:8080/ws/v2/producer/persistent/:tenant/:namespace/:topic
```

##### Query param

Key | Type | Required? | Explanation
:---|:-----|:----------|:-----------
`sendTimeoutMillis` | long | no | Send timeout (default: 30 secs)
`batchingEnabled` | boolean | no | Enable batching of messages (default: false)
`batchingMaxMessages` | int | no | Maximum number of messages permitted in a batch (default: 1000)
`maxPendingMessages` | int | no | Set the max size of the internal queue holding the messages (default: 1000)
`batchingMaxPublishDelay` | long | no | Time period within which the messages will be batched (default: 10ms)
`messageRoutingMode` | string | no | Message [routing mode](https://pulsar.incubator.apache.org/api/client/index.html?org/apache/pulsar/client/api/ProducerConfiguration.MessageRoutingMode.html) for the partitioned producer: `SinglePartition`, `RoundRobinPartition`
`compressionType` | string | no | Compression [type](https://pulsar.incubator.apache.org/api/client/index.html?org/apache/pulsar/client/api/CompressionType.html): `LZ4`, `ZLIB`
`producerName` | string | no | Specify the name for the producer. Pulsar enforces that only one producer with the same name can be publishing on a topic
`initialSequenceId` | long | no | Set the baseline for the sequence ids for messages published by the producer.
`hashingScheme` | string | no | [Hashing function](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.HashingScheme.html) to use when publishing on a partitioned topic: `JavaStringHash`, `Murmur3_32Hash`


#### Publishing a message

```json
{
  "payload": "SGVsbG8gV29ybGQ=",
  "properties": {"key1": "value1", "key2": "value2"},
  "context": "1"
}
```

Key | Type | Required? | Explanation
:---|:-----|:----------|:-----------
`payload` | string | yes | Base-64 encoded payload
`properties` | key-value pairs | no | Application-defined properties
`context` | string | no | Application-defined request identifier
`key` | string | no | For partitioned topics, decides which partition to use
`replicationClusters` | array | no | Restrict replication to this list of {% popover clusters %}, specified by name
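
The `payload` field must be Base64-encoded, since raw bytes can't be carried in JSON. As a quick illustration (in Python, which is also used in the client examples below), the payload shown above decodes back to the original string:

```python
import base64

# "SGVsbG8gV29ybGQ=" is the Base64 encoding of the UTF-8 bytes of "Hello World"
print(base64.b64decode('SGVsbG8gV29ybGQ='))  # b'Hello World'
```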

##### Example success response

```json
{
  "result": "ok",
  "messageId": "CAAQAw==",
  "context": "1"
}
```

##### Example failure response

```json
{
  "result": "send-error:3",
  "errorMsg": "Failed to de-serialize from JSON",
  "context": "1"
}
```

Key | Type | Required? | Explanation
:---|:-----|:----------|:-----------
`result` | string | yes | `ok` if successful or an error message if unsuccessful
`messageId` | string | yes | Message ID assigned to the published message
`context` | string | no | Application-defined request identifier


### Consumer endpoint

The consumer endpoint requires you to specify a tenant, namespace, and topic, as well as a subscription, in the URL:

```http
ws://broker-service-url:8080/ws/v2/consumer/persistent/:tenant/:namespace/:topic/:subscription
```

##### Query param

Key | Type | Required? | Explanation
:---|:-----|:----------|:-----------
`ackTimeoutMillis` | long | no | Set the timeout for unacked messages (default: 0)
`subscriptionType` | string | no | [Subscription type](https://pulsar.incubator.apache.org/api/client/index.html?org/apache/pulsar/client/api/SubscriptionType.html): `Exclusive`, `Failover`, `Shared`
`receiverQueueSize` | int | no | Size of the consumer receive queue (default: 1000)
`consumerName` | string | no | Consumer name
`priorityLevel` | int | no | Define a [priority](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setPriorityLevel-int-) for the consumer

##### Receiving messages

The server will push messages on the WebSocket session:

```json
{
  "messageId": "CAAQAw==",
  "payload": "SGVsbG8gV29ybGQ=",
  "properties": {"key1": "value1", "key2": "value2"},
  "publishTime": "2016-08-30 16:45:57.785"
}
```

Key | Type | Required? | Explanation
:---|:-----|:----------|:-----------
`messageId` | string | yes | Message ID
`payload` | string | yes | Base-64 encoded payload
`publishTime` | string | yes | Publish timestamp
`properties` | key-value pairs | no | Application-defined properties
`key` | string | no | Original routing key set by producer

#### Acknowledging the message

The consumer needs to acknowledge the successful processing of the message so that the Pulsar broker can delete it:

```json
{
  "messageId": "CAAQAw=="
}
```

Key | Type | Required? | Explanation
:---|:-----|:----------|:-----------
`messageId`| string | yes | Message ID of the processed message


### Reader endpoint

The reader endpoint requires you to specify a tenant, namespace, and topic in the URL:

```http
ws://broker-service-url:8080/ws/v2/reader/persistent/:tenant/:namespace/:topic
```

##### Query param

Key | Type | Required? | Explanation
:---|:-----|:----------|:-----------
`readerName` | string | no | Reader name
`receiverQueueSize` | int | no | Size of the consumer receive queue (default: 1000)
`messageId` | int or enum | no | Message ID to start from, `earliest` or `latest` (default: `latest`)

##### Receiving messages

The server will push messages on the WebSocket session:

```json
{
  "messageId": "CAAQAw==",
  "payload": "SGVsbG8gV29ybGQ=",
  "properties": {"key1": "value1", "key2": "value2"},
  "publishTime": "2016-08-30 16:45:57.785"
}
```

Key | Type | Required? | Explanation
:---|:-----|:----------|:-----------
`messageId` | string | yes | Message ID
`payload` | string | yes | Base-64 encoded payload
`publishTime` | string | yes | Publish timestamp
`properties` | key-value pairs | no | Application-defined properties
`key` | string | no | Original routing key set by producer

#### Acknowledging the message

**In WebSocket**, the reader needs to acknowledge the successful processing of the message so that the Pulsar WebSocket service can update the number of pending messages.
If you don't send acknowledgements, the Pulsar WebSocket service will stop sending messages after reaching the `pendingMessages` limit.

```json
{
  "messageId": "CAAQAw=="
}
```

Key | Type | Required? | Explanation
:---|:-----|:----------|:-----------
`messageId`| string | yes | Message ID of the processed message


### Error codes

In case of error the server will close the WebSocket session using the following error codes:

Error Code | Error Message
:----------|:-------------
1 | Failed to create producer
2 | Failed to subscribe
3 | Failed to deserialize from JSON
4 | Failed to serialize to JSON
5 | Failed to authenticate client
6 | Client is not authorized
7 | Invalid payload encoding
8 | Unknown error

> #### Reconnection
> The application is responsible for re-establishing a new WebSocket session after a backoff period.

## Client examples

Below you'll find code examples for the Pulsar WebSocket API in [Python](#python) and [Node.js](#nodejs).

### Python

This example uses the [`websocket-client`](https://pypi.python.org/pypi/websocket-client) package. You can install it using [pip](https://pypi.python.org/pypi/pip):

```shell
$ pip install websocket-client
```

You can also download it from [PyPI](https://pypi.python.org/pypi/websocket-client).

#### Python producer

Here's an example Python producer that sends a simple message to a Pulsar {% popover topic %}:

```python
import websocket, base64, json

TOPIC = 'ws://localhost:8080/ws/v2/producer/persistent/public/default/my-topic'

ws = websocket.create_connection(TOPIC)

# Send one message as JSON
ws.send(json.dumps({
    'payload' : base64.b64encode(b'Hello World').decode('utf-8'),
    'properties': {
        'key1' : 'value1',
        'key2' : 'value2'
    },
    'context' : '5'
}))

response = json.loads(ws.recv())
if response['result'] == 'ok':
    print('Message published successfully')
else:
    print('Failed to publish message:', response)
ws.close()
```

#### Python consumer

Here's an example Python consumer that listens on a Pulsar topic and prints the message ID whenever a message arrives:

```python
import websocket, base64, json

TOPIC = 'ws://localhost:8080/ws/v2/consumer/persistent/public/default/my-topic/my-sub'

ws = websocket.create_connection(TOPIC)

while True:
    msg = json.loads(ws.recv())
    if not msg: break

    print("Received: {} - payload: {}".format(msg, base64.b64decode(msg['payload'])))

    # Acknowledge successful processing
    ws.send(json.dumps({'messageId' : msg['messageId']}))

ws.close()
```

#### Python reader

Here's an example Python reader that listens on a Pulsar topic and prints the message ID whenever a message arrives:

```python
import websocket, base64, json

TOPIC = 'ws://localhost:8080/ws/v2/reader/persistent/public/default/my-topic'

ws = websocket.create_connection(TOPIC)

while True:
    msg = json.loads(ws.recv())
    if not msg: break

    print("Received: {} - payload: {}".format(msg, base64.b64decode(msg['payload'])))

    # Acknowledge successful processing
    ws.send(json.dumps({'messageId' : msg['messageId']}))

ws.close()
```

### Node.js

This example uses the [`ws`](https://websockets.github.io/ws/) package.
You can install it using [npm](https://www.npmjs.com/):

```shell
$ npm install ws
```

#### Node.js producer

Here's an example Node.js producer that sends a simple message to a Pulsar topic:

```javascript
var WebSocket = require('ws'),
    topic = "ws://localhost:8080/ws/v2/producer/persistent/my-tenant/my-ns/my-topic1",
    ws = new WebSocket(topic);

var message = {
    "payload" : Buffer.from("Hello World").toString('base64'),
    "properties": {
        "key1" : "value1",
        "key2" : "value2"
    },
    "context" : "1"
};

ws.on('open', function() {
    // Send one message
    ws.send(JSON.stringify(message));
});

ws.on('message', function(message) {
    console.log('received ack: %s', message);
});
```

#### Node.js consumer

Here's an example Node.js consumer that listens on the same topic used by the producer above:

```javascript
var WebSocket = require('ws'),
    topic = "ws://localhost:8080/ws/v2/consumer/persistent/my-tenant/my-ns/my-topic1/my-sub",
    ws = new WebSocket(topic);

ws.on('message', function(message) {
    var receiveMsg = JSON.parse(message);
    console.log('Received: %s - payload: %s', message, Buffer.from(receiveMsg.payload, 'base64').toString());
    var ackMsg = {"messageId" : receiveMsg.messageId};
    ws.send(JSON.stringify(ackMsg));
});
```

#### Node.js reader

```javascript
var WebSocket = require('ws'),
    topic = "ws://localhost:8080/ws/v2/reader/persistent/my-tenant/my-ns/my-topic1",
    ws = new WebSocket(topic);

ws.on('message', function(message) {
    var receiveMsg = JSON.parse(message);
    console.log('Received: %s - payload: %s', message, Buffer.from(receiveMsg.payload, 'base64').toString());
    var ackMsg = {"messageId" : receiveMsg.messageId};
    ws.send(JSON.stringify(ackMsg));
});
```
diff --git a/site2/docs/cookbooks-compaction.md b/site2/docs/cookbooks-compaction.md
new file mode 100644
index 0000000000000000000000000000000000000000..bf8fce8a34d1a4f16e88ca4ffa9589f5977443ac
--- /dev/null
+++ b/site2/docs/cookbooks-compaction.md
@@ -0,0 +1,131 @@
---
id: cookbooks-compaction
title: Topic compaction
sidebar_label: Topic compaction
---

Pulsar's [topic compaction](getting-started-concepts-and-architecture.md#compaction) feature enables you to create **compacted** topics in which older, "obscured" entries are pruned from the topic, allowing for faster reads through the topic's history (which messages are deemed obscured/outdated/irrelevant will depend on your use case).

To use compaction:

* You need to give messages keys, as topic compaction in Pulsar takes place on a *per-key basis* (i.e. messages are compacted based on their key). For a stock ticker use case, the stock symbol---e.g. `AAPL` or `GOOG`---could serve as the key (more on this [below](#when)). Messages without keys will be left alone by the compaction process.
* Compaction can be configured to run [automatically](#automatic), or you can manually [trigger](#trigger) compaction using the Pulsar administrative API.
* Your consumers must be [configured](#config) to read from compacted topics ([Java consumers](#java), for example, have a `readCompacted` setting that must be set to `true`). If this configuration is not set, consumers will still be able to read from the non-compacted topic.


> Compaction only works on messages that have keys (as in the stock ticker example, where the stock symbol serves as the key for each message). Keys can thus be thought of as the axis along which compaction is applied. Messages that don't have keys are simply ignored by compaction.

## When should I use compacted topics? {#when}

The classic example of a topic that could benefit from compaction would be a stock ticker topic through which consumers can access up-to-date values for specific stocks. Imagine a scenario in which messages carrying stock value data use the stock symbol as the key (`GOOG`, `AAPL`, `TWTR`, etc.). Compacting this topic would give consumers on the topic two options:

* They can read from the "original," non-compacted topic in case they need access to "historical" values, i.e. the entirety of the topic's messages.
* They can read from the compacted topic if they only want to see the most up-to-date messages.

Thus, if you're using a Pulsar topic called `stock-values`, some consumers could have access to all messages in the topic (perhaps because they're performing some kind of number crunching of all values in the last hour) while the consumers used to power the real-time stock ticker only see the compacted topic (and thus aren't forced to process outdated messages). Which variant of the topic any given consumer pulls messages from is determined by the consumer's [configuration](#config).

> One of the benefits of compaction in Pulsar is that you aren't forced to choose between compacted and non-compacted topics, as the compaction process leaves the original topic as-is and essentially adds an alternate topic. In other words, you can run compaction on a topic and consumers that need access to the non-compacted version of the topic will not be adversely affected.


## Configuring compaction to run automatically {#automatic}

Tenant administrators can configure a policy for compaction at the namespace level. The policy specifies how large the topic backlog can grow before compaction is triggered.

For example, to trigger compaction when the backlog reaches 100MB:

```bash
$ bin/pulsar-admin namespaces set-compaction-threshold \
  --threshold 100M my-tenant/my-namespace
```

Configuring the compaction threshold on a namespace will apply to all topics within that namespace.

## Triggering compaction manually {#trigger}

In order to run compaction on a topic, you need to use the [`topics compact`](reference-pulsar-admin.md#topics-compact) command for the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool. Here's an example:

```bash
$ bin/pulsar-admin topics compact \
  persistent://my-tenant/my-namespace/my-topic
```

The `pulsar-admin` tool runs compaction via the Pulsar [REST API](reference-rest-api.md). To run compaction in its own dedicated process, i.e. *not* through the REST API, you can use the [`pulsar compact-topic`](reference-cli-tools.md#pulsar-compact-topic) command. Here's an example:

```bash
$ bin/pulsar compact-topic \
  --topic persistent://my-tenant/my-namespace/my-topic
```

> Running compaction in its own process is recommended when you want to avoid interfering with the broker's performance. Broker performance should only be affected, however, when running compaction on topics with a large keyspace (i.e. when there are many keys on the topic). The first phase of the compaction process keeps a copy of each key in the topic, which can create memory pressure as the number of keys grows. Using the `pulsar-admin topics compact` command to run compaction through the REST API should present no issues in the overwhelming majority of cases; using `pulsar compact-topic` should correspondingly be considered an edge case.

The `pulsar compact-topic` command communicates with [ZooKeeper](https://zookeeper.apache.org) directly. In order to establish communication with ZooKeeper, though, the `pulsar` CLI tool will need to have a valid [broker configuration](reference-configuration.md#broker). You can either supply a proper configuration in `conf/broker.conf` or specify a non-default location for the configuration:

```bash
$ bin/pulsar compact-topic \
  --broker-conf /path/to/broker.conf \
  --topic persistent://my-tenant/my-namespace/my-topic

# If the configuration is in conf/broker.conf
$ bin/pulsar compact-topic \
  --topic persistent://my-tenant/my-namespace/my-topic
```

#### When should I trigger compaction?

How often you [trigger compaction](#trigger) will vary widely based on the use case. If you want a compacted topic to be extremely speedy on read, then you should run compaction fairly frequently.

## Consumer configuration {#config}

Pulsar consumers and readers need to be configured to read from compacted topics. The sections below show you how to enable compacted topic reads for Pulsar's language clients.


> #### Java only
> Currently, only [Java](#java) clients can consume messages from compacted topics.


### Java

In order to read from a compacted topic using a Java consumer, the `readCompacted` parameter must be set to `true`. Here's an example consumer for a compacted topic:

```java
Consumer<byte[]> compactedTopicConsumer = client.newConsumer()
    .topic("some-compacted-topic")
    .readCompacted(true)
    .subscribe();
```

As mentioned above, topic compaction in Pulsar works on a *per-key basis*. That means that messages that you produce on compacted topics need to have keys (the content of the key will depend on your use case). Messages that don't have keys will be ignored by the compaction process. Here's an example Pulsar message with a key:

```java
import org.apache.pulsar.client.api.Message;
import org.apache.pulsar.client.api.MessageBuilder;

Message msg = MessageBuilder.create()
    .setContent(someByteArray)
    .setKey("some-key")
    .build();
```

The example below shows a message with a key being produced on a compacted Pulsar topic:

```java
import org.apache.pulsar.client.api.Message;
import org.apache.pulsar.client.api.MessageBuilder;
import org.apache.pulsar.client.api.Producer;
import org.apache.pulsar.client.api.PulsarClient;

PulsarClient client = PulsarClient.builder()
    .serviceUrl("pulsar://localhost:6650")
    .build();

Producer compactedTopicProducer = client.newProducer()
    .topic("some-compacted-topic")
    .create();

Message msg = MessageBuilder.create()
    .setContent(someByteArray)
    .setKey("some-key")
    .build();

compactedTopicProducer.send(msg);
```
diff --git a/site2/docs/cookbooks-deduplication.md b/site2/docs/cookbooks-deduplication.md
new file mode 100644
index 0000000000000000000000000000000000000000..19abd48c6bbc0fc2bd0abad96f6a7ae0537c0372
--- /dev/null
+++ b/site2/docs/cookbooks-deduplication.md
@@ -0,0 +1,121 @@
---
id: cookbooks-deduplication
title: Message deduplication
sidebar_label: Managing message deduplication
---

**Message deduplication** is a feature of Pulsar that, when enabled, ensures that each message produced on Pulsar topics is persisted to disk *only once*, even if the message is produced more than once. Message deduplication essentially unburdens Pulsar applications of the responsibility of ensuring deduplication and instead handles it automatically on the server side.

Using message deduplication in Pulsar involves making some [configuration changes](#configuration) to your Pulsar brokers as well as some minor changes to the behavior of Pulsar [clients](#clients).

> For a more thorough theoretical explanation of message deduplication, see the [Concepts and Architecture](getting-started-concepts-and-architecture.md#message-deduplication) document.


## How it works

Message deduplication can be enabled and disabled on a per-namespace basis. By default, it is *disabled* on all namespaces and can be enabled in the following ways:

* Using the [`pulsar-admin namespaces`](#enabling) interface
* As a broker-level [default](#default) for all namespaces

## Configuration for message deduplication

You can configure message deduplication in Pulsar using the [`broker.conf`](reference-configuration.md#broker) configuration file. The following deduplication-related parameters are available:

Parameter | Description | Default
:---------|:------------|:-------
`brokerDeduplicationEnabled` | Sets the default behavior for message deduplication in the Pulsar {% popover broker %}. If set to `true`, message deduplication will be enabled by default on all namespaces; if set to `false` (the default), deduplication will have to be [enabled](#enabling) and [disabled](#disabling) on a per-namespace basis. | `false`
`brokerDeduplicationMaxNumberOfProducers` | The maximum number of producers for which information will be stored for deduplication purposes. | `10000`
`brokerDeduplicationEntriesInterval` | The number of entries after which a deduplication informational snapshot is taken. A larger interval will lead to fewer snapshots being taken, though this would also lengthen the topic recovery time (the time required for entries published after the snapshot to be replayed). | `1000`
`brokerDeduplicationProducerInactivityTimeoutMinutes` | The time of inactivity (in minutes) after which the broker will discard deduplication information related to a disconnected producer. | `360` (6 hours)

### Setting the broker-level default {#default}

By default, message deduplication is *disabled* on all Pulsar namespaces. To enable it by default on all namespaces, set the `brokerDeduplicationEnabled` parameter to `true` and re-start the broker.

Regardless of the value of `brokerDeduplicationEnabled`, [enabling](#enabling) and [disabling](#disabling) via the CLI will override the broker-level default.

### Enabling message deduplication {#enabling}

You can enable message deduplication on specific namespaces, regardless of the [default](#default) for the broker, using the [`pulsar-admin namespace set-deduplication`](reference-pulsar-admin.md#namespace-set-deduplication) command. You can use the `--enable`/`-e` flag and specify the namespace. Here's an example:

```bash
$ bin/pulsar-admin namespaces set-deduplication \
  public/default \
  --enable # or just -e
```

### Disabling message deduplication {#disabling}

You can disable message deduplication on a specific namespace using the same method shown [above](#enabling), except using the `--disable`/`-d` flag instead. Here's an example:

```bash
$ bin/pulsar-admin namespaces set-deduplication \
  public/default \
  --disable # or just -d
```
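
To tie the broker-side settings together, here's a sketch of a `broker.conf` fragment that enables deduplication by default; the values shown are simply the documented defaults from the table above, with `brokerDeduplicationEnabled` flipped to `true`:

```properties
# Enable deduplication by default on all namespaces
brokerDeduplicationEnabled=true
# Track deduplication info for up to 10000 producers
brokerDeduplicationMaxNumberOfProducers=10000
# Take a deduplication snapshot every 1000 entries
brokerDeduplicationEntriesInterval=1000
# Discard info for producers that have been inactive for 6 hours
brokerDeduplicationProducerInactivityTimeoutMinutes=360
```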

## Message deduplication and Pulsar clients {#clients}

If you enable message deduplication in your Pulsar brokers, you won't need to make any major changes to your Pulsar clients. There are, however, two settings that you need to provide for your client producers:

1. The producer must be given a name
1. The message send timeout needs to be set to infinity (i.e. no timeout)

Instructions for [Java](#java), [Python](#python), and [C++](#cpp) clients can be found below.

### Java clients {#java}

To enable message deduplication on a [Java producer](client-libraries-java.md#producers), set the producer name using the `producerName` setter and set the timeout to 0 using the `sendTimeout` setter. Here's an example:

```java
import org.apache.pulsar.client.api.Producer;
import org.apache.pulsar.client.api.PulsarClient;
import java.util.concurrent.TimeUnit;

PulsarClient pulsarClient = PulsarClient.builder()
    .serviceUrl("pulsar://localhost:6650")
    .build();
Producer producer = pulsarClient.newProducer()
    .producerName("producer-1")
    .topic("persistent://public/default/topic-1")
    .sendTimeout(0, TimeUnit.SECONDS)
    .create();
```

### Python clients {#python}

To enable message deduplication on a [Python producer](client-libraries-python.md#producers), set the producer name using `producer_name` and the timeout to 0 using `send_timeout_millis`. Here's an example:

```python
import pulsar

client = pulsar.Client("pulsar://localhost:6650")
producer = client.create_producer(
    "persistent://public/default/topic-1",
    producer_name="producer-1",
    send_timeout_millis=0)
```

### C++ clients {#cpp}

To enable message deduplication on a [C++ producer](client-libraries-cpp.md#producer), set the producer name using `producer_name` and the timeout to 0 using `send_timeout_millis`. Here's an example:

```cpp
#include <pulsar/Client.h>

std::string serviceUrl = "pulsar://localhost:6650";
std::string topic = "persistent://some-tenant/ns1/topic-1";
std::string producerName = "producer-1";

Client client(serviceUrl);

ProducerConfiguration producerConfig;
producerConfig.setSendTimeout(0);
producerConfig.setProducerName(producerName);

Producer producer;

Result result = client.createProducer(topic, producerConfig, producer);
```

diff --git a/site2/docs/cookbooks-encryption.md b/site2/docs/cookbooks-encryption.md
new file mode 100644
index 0000000000000000000000000000000000000000..1bf22c9999b402849b9becfb9d0d020a5969b683
--- /dev/null
+++ b/site2/docs/cookbooks-encryption.md
@@ -0,0 +1,169 @@
---
id: cookbooks-encryption
title: Pulsar Encryption
sidebar_label: Encryption
---

Pulsar encryption allows applications to encrypt messages at the producer and decrypt them at the consumer. Encryption is performed using the public/private key pair configured by the application. Encrypted messages can only be decrypted by consumers with a valid key.

## Asymmetric and symmetric encryption

Pulsar uses a dynamically generated symmetric AES key to encrypt messages (data). The AES key (the data key) is encrypted using an application-provided ECDSA/RSA key pair; as a result, there is no need to share the secret with everyone.

The key is a public/private key pair used for encryption/decryption. The producer key is the public key, and the consumer key is the private key of the key pair.

The application configures the producer with the public key. This key is used to encrypt the AES data key. The encrypted data key is sent as part of the message header. Only entities with the private key (in this case the consumer) will be able to decrypt the data key, which is used to decrypt the message.

A message can be encrypted with more than one key.
+2. Add the public and private keys to your key management system, and configure your producers to retrieve public keys and your consumers to retrieve private keys.
+3. Implement the `CryptoKeyReader::getPublicKey()` interface on the producer side and the `CryptoKeyReader::getPrivateKey()` interface on the consumer side; these will be invoked by the Pulsar client to load the keys.
+4. Add the encryption key to the producer configuration: `conf.addEncryptionKey("myapp.key")`
+5. Add the `CryptoKeyReader` implementation to the producer/consumer configuration: `conf.setCryptoKeyReader(keyReader)`
+6. Sample producer application:
+```java
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Map;
+
+import org.apache.pulsar.client.api.CryptoKeyReader;
+import org.apache.pulsar.client.api.EncryptionKeyInfo;
+import org.apache.pulsar.client.api.Producer;
+import org.apache.pulsar.client.api.ProducerConfiguration;
+import org.apache.pulsar.client.api.PulsarClient;
+
+class RawFileKeyReader implements CryptoKeyReader {
+
+    String publicKeyFile = "";
+    String privateKeyFile = "";
+
+    RawFileKeyReader(String pubKeyFile, String privKeyFile) {
+        publicKeyFile = pubKeyFile;
+        privateKeyFile = privKeyFile;
+    }
+
+    @Override
+    public EncryptionKeyInfo getPublicKey(String keyName, Map<String, String> keyMeta) {
+        EncryptionKeyInfo keyInfo = new EncryptionKeyInfo();
+        try {
+            keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile)));
+        } catch (IOException e) {
+            System.out.println("ERROR: Failed to read public key from file " + publicKeyFile);
+            e.printStackTrace();
+        }
+        return keyInfo;
+    }
+
+    @Override
+    public EncryptionKeyInfo getPrivateKey(String keyName, Map<String, String> keyMeta) {
+        EncryptionKeyInfo keyInfo = new EncryptionKeyInfo();
+        try {
+            keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile)));
+        } catch (IOException e) {
+            System.out.println("ERROR: Failed to read private key from file " + privateKeyFile);
+            e.printStackTrace();
+        }
+        return keyInfo;
+    }
+}
+
+PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080");
+
+ProducerConfiguration prodConf = new ProducerConfiguration();
+prodConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem"));
+prodConf.addEncryptionKey("myappkey");
+
+Producer producer = pulsarClient.createProducer("persistent://my-tenant/my-ns/my-topic", prodConf);
+
+for (int i = 0; i < 10; i++) {
+    producer.send("my-message".getBytes());
+}
+
+pulsarClient.close();
+```
+7. Sample consumer application:
+```java
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Map;
+
+import org.apache.pulsar.client.api.Consumer;
+import org.apache.pulsar.client.api.ConsumerConfiguration;
+import org.apache.pulsar.client.api.CryptoKeyReader;
+import org.apache.pulsar.client.api.EncryptionKeyInfo;
+import org.apache.pulsar.client.api.Message;
+import org.apache.pulsar.client.api.PulsarClient;
+
+class RawFileKeyReader implements CryptoKeyReader {
+
+    String publicKeyFile = "";
+    String privateKeyFile = "";
+
+    RawFileKeyReader(String pubKeyFile, String privKeyFile) {
+        publicKeyFile = pubKeyFile;
+        privateKeyFile = privKeyFile;
+    }
+
+    @Override
+    public EncryptionKeyInfo getPublicKey(String keyName, Map<String, String> keyMeta) {
+        EncryptionKeyInfo keyInfo = new EncryptionKeyInfo();
+        try {
+            keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile)));
+        } catch (IOException e) {
+            System.out.println("ERROR: Failed to read public key from file " + publicKeyFile);
+            e.printStackTrace();
+        }
+        return keyInfo;
+    }
+
+    @Override
+    public EncryptionKeyInfo getPrivateKey(String keyName, Map<String, String> keyMeta) {
+        EncryptionKeyInfo keyInfo = new EncryptionKeyInfo();
+        try {
+            keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile)));
+        } catch (IOException e) {
+            System.out.println("ERROR: Failed to read private key from file " + privateKeyFile);
+            e.printStackTrace();
+        }
+        return keyInfo;
+    }
+}
+
+ConsumerConfiguration consConf = new ConsumerConfiguration();
+consConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem"));
+PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080");
+Consumer consumer = pulsarClient.subscribe("persistent://my-tenant/my-ns/my-topic", "my-subscriber-name", consConf);
+Message msg = null;
+
+for (int i = 0; i < 10; i++) {
+    msg = consumer.receive();
+    // do something
+    System.out.println("Received: " + new String(msg.getData()));
+}
+
+// Acknowledge the consumption of all messages at once
+consumer.acknowledgeCumulative(msg);
+pulsarClient.close();
+```
+
+## Key rotation
+Pulsar generates a new AES data key every 4 hours or after a certain number of messages are published. The asymmetric public key is automatically fetched by the producer every 4 hours by calling `CryptoKeyReader::getPublicKey()` to retrieve the latest version.
+
+## Enabling encryption at the producer application
+If you produce messages that are consumed across application boundaries, you need to ensure that consumers in other applications have access to one of the private keys that can decrypt the messages. This can be done in two ways:
+1. The consumer application provides you access to its public key, which you add to your producer keys
+1. You grant access to one of the private keys from the pairs used by the producer
+
+In some cases, the producer may want to encrypt the messages with multiple keys. For this, add all such keys to the config. The consumer will be able to decrypt the message as long as it has access to at least one of the keys.
+
+For example, if messages need to be encrypted using two keys, `myapp.messagekey1` and `myapp.messagekey2`:
+
+```java
+conf.addEncryptionKey("myapp.messagekey1");
+conf.addEncryptionKey("myapp.messagekey2");
+```
+
+## Decrypting encrypted messages at the consumer application
+Consumers require access to one of the private keys to decrypt messages produced by the producer. If you would like to receive encrypted messages, create a public/private key pair and give your public key to the producer application to encrypt messages using your public key.
+
+## Handling failures
+* Producer/consumer loses access to the key
+  * The producer action will fail, indicating the cause of the failure. The application has the option to proceed with sending unencrypted messages in such cases. Call `conf.setCryptoFailureAction(ProducerCryptoFailureAction)` to control the producer behavior. The default behavior is to fail the request.
+  * If consumption fails due to a decryption failure or missing keys on the consumer, the application has the option to consume the encrypted message or discard it. Call `conf.setCryptoFailureAction(ConsumerCryptoFailureAction)` to control the consumer behavior. The default behavior is to fail the request. The application will never be able to decrypt the messages if the private key is permanently lost.
+* Batch messaging
+  * If decryption fails and the message contains batch messages, the client will not be able to retrieve individual messages in the batch, so message consumption fails even if `conf.setCryptoFailureAction()` is set to `CONSUME`.
+* If decryption fails, message consumption stops and the application will notice backlog growth in addition to decryption failure messages in the client log. If the application does not have access to the private key to decrypt the message, the only option is to skip or discard backlogged messages.
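+
+As a sketch, the failure actions described above can be set on the producer and consumer configurations from the earlier samples (the enum values follow the `ProducerCryptoFailureAction` and `ConsumerCryptoFailureAction` types named above):
+
+```java
+// Producer: send messages unencrypted instead of failing if encryption fails
+prodConf.setCryptoFailureAction(ProducerCryptoFailureAction.SEND);
+
+// Consumer: deliver the still-encrypted payload to the application rather
+// than failing the receive when decryption is not possible
+consConf.setCryptoFailureAction(ConsumerCryptoFailureAction.CONSUME);
+```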
+
diff --git a/site2/docs/cookbooks-message-queue.md b/site2/docs/cookbooks-message-queue.md
new file mode 100644
index 0000000000000000000000000000000000000000..a4da5b4dbe32c40d0d5e5dccaf1034910f246eac
--- /dev/null
+++ b/site2/docs/cookbooks-message-queue.md
@@ -0,0 +1,94 @@
+---
+id: cookbooks-message-queue
+title: Using Pulsar as a message queue
+sidebar_label: Message queue
+---
+
+Message queues are essential components of many large-scale data architectures. If every single work object that passes through your system absolutely *must* be processed in spite of the slowness or outright failure of this or that system component, there's a good chance that you'll need a message queue to step in and ensure that unprocessed data is retained---with correct ordering---until the required actions are taken.
+
+Pulsar is a great choice for a message queue because:
+
+* it was built with [persistent message storage](getting-started-concepts-and-architecture.md#persistent-storage) in mind
+* it offers automatic load balancing across consumers for messages on a topic (or custom load balancing if you wish)
+
+> You can use the same Pulsar installation to act as a real-time message bus and as a message queue if you wish (or just one or the other). You can set aside some topics for real-time purposes and other topics for message queue purposes (or use specific namespaces for either purpose if you wish).
+
+
+## Client configuration changes
+
+To use a Pulsar topic as a message queue, you should distribute the receiver load on that topic across several consumers (the optimal number of consumers will depend on the load). Each consumer must:
+
+* Establish a [shared subscription](getting-started-concepts-and-architecture.md#shared) and use the same subscription name as the other consumers (otherwise the subscription is not shared and the consumers can't act as a processing ensemble)
+* If you'd like to have tight control over message dispatching across consumers, set the consumers' **receiver queue** size very low (potentially even to 0 if necessary). Each Pulsar consumer has a receiver queue that determines how many messages the consumer will attempt to fetch at a time. A receiver queue of 1000 (the default), for example, means that the consumer will attempt to process 1000 messages from the topic's backlog upon connection. Setting the receiver queue to zero essentially means ensuring that each consumer is only doing one thing at a time.
+
+  The downside to restricting the receiver queue size of consumers is that it limits the potential throughput of those consumers, and it cannot be used with partitioned topics. Whether the performance/control trade-off is worthwhile will depend on your use case.
+
+### Java clients
+
+Here's an example Java consumer configuration that uses a shared subscription:
+
+```java
+import org.apache.pulsar.client.api.Consumer;
+import org.apache.pulsar.client.api.PulsarClient;
+import org.apache.pulsar.client.api.SubscriptionType;
+
+String SERVICE_URL = "pulsar://localhost:6650";
+String TOPIC = "persistent://public/default/mq-topic-1";
+String subscription = "sub-1";
+
+PulsarClient client = PulsarClient.builder()
+        .serviceUrl(SERVICE_URL)
+        .build();
+
+Consumer consumer = client.newConsumer()
+        .topic(TOPIC)
+        .subscriptionName(subscription)
+        .subscriptionType(SubscriptionType.Shared)
+        // If you'd like to restrict the receiver queue size
+        .receiverQueueSize(10)
+        .subscribe();
+```
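+
+With the shared subscription in place, a typical work-queue loop receives a message, processes it, and then acknowledges it so that the broker can delete it; unacknowledged messages will be redelivered to the ensemble. Here's a minimal sketch (assuming `org.apache.pulsar.client.api.Message` is also imported; `process` is a hypothetical application method):
+
+```java
+while (true) {
+    // Block until a message is dispatched to this consumer
+    Message msg = consumer.receive();
+    try {
+        process(msg.getData()); // hypothetical application logic
+        // Ack so the message is removed from the backlog
+        consumer.acknowledge(msg);
+    } catch (Exception e) {
+        // Simply not acknowledging leaves the message to be redelivered later
+    }
+}
+```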
+
+### Python clients
+
+Here's an example Python consumer configuration that uses a shared subscription:
+
+```python
+from pulsar import Client, ConsumerType
+
+SERVICE_URL = "pulsar://localhost:6650"
+TOPIC = "persistent://public/default/mq-topic-1"
+SUBSCRIPTION = "sub-1"
+
+client = Client(SERVICE_URL)
+consumer = client.subscribe(
+    TOPIC,
+    SUBSCRIPTION,
+    # If you'd like to restrict the receiver queue size
+    receiver_queue_size=10,
+    consumer_type=ConsumerType.Shared)
+```
+
+### C++ clients
+
+Here's an example C++ consumer configuration that uses a shared subscription:
+
+```cpp
+#include <pulsar/Client.h>
+
+using namespace pulsar;
+
+std::string serviceUrl = "pulsar://localhost:6650";
+std::string topic = "persistent://public/default/mq-topic-1";
+std::string subscription = "sub-1";
+
+Client client(serviceUrl);
+
+ConsumerConfiguration consumerConfig;
+consumerConfig.setConsumerType(ConsumerShared);
+// If you'd like to restrict the receiver queue size
+consumerConfig.setReceiverQueueSize(10);
+
+Consumer consumer;
+
+Result result = client.subscribe(topic, subscription, consumerConfig, consumer);
+```
+
diff --git a/site2/docs/cookbooks-non-persistent.md b/site2/docs/cookbooks-non-persistent.md
new file mode 100644
index 0000000000000000000000000000000000000000..64eb454c2d6b10d5bead502bc20617381a0d7c90
--- /dev/null
+++ b/site2/docs/cookbooks-non-persistent.md
@@ -0,0 +1,58 @@
+---
+id: cookbooks-non-persistent
+title: Non-persistent messaging
+sidebar_label: Non-persistent messaging
+---
+
+**Non-persistent topics** are Pulsar topics in which message data is *never* [persistently stored](getting-started-concepts-and-architecture.md#persistent-storage) and kept only in memory. This cookbook provides:
+
+* A basic [conceptual overview](#overview) of non-persistent topics
+* Information about [configurable parameters](#configuration) related to non-persistent topics
+* A guide to the [CLI interface](#cli) for managing non-persistent topics
+
+## Overview
+
+By default, Pulsar persistently stores *all* unacknowledged messages on multiple [BookKeeper](getting-started-concepts-and-architecture.md#persistent-storage) bookies (storage nodes). Data for messages on persistent topics can thus survive broker restarts and subscriber failover.
+
+Pulsar also, however, supports **non-persistent topics**, which are topics on which messages are *never* persisted to disk and live only in memory. When using non-persistent delivery, killing a Pulsar broker or disconnecting a subscriber from a topic means that all in-transit messages are lost on that (non-persistent) topic, so clients may see message loss.
+
+Non-persistent topics have names of this form (note the `non-persistent` in the name):
+
+```http
+non-persistent://tenant/namespace/topic
+```
+
+> For more high-level information about non-persistent topics, see the [Concepts and Architecture](getting-started-concepts-and-architecture.md#non-persistent-topics) documentation.
+
+## Using
+
+> In order to use non-persistent topics, they must be [enabled](#enabling) in your Pulsar broker configuration.
+
+In order to use non-persistent topics, you only need to differentiate them by name when interacting with them. This [`pulsar-client produce`](reference-cli-tools.md#pulsar-client-produce) command, for example, would produce one message on a non-persistent topic in a standalone cluster:
+
+```bash
+$ bin/pulsar-client produce non-persistent://public/default/example-np-topic \
+  --num-produce 1 \
+  --messages "This message will be stored only in memory"
+```
+
+> For a more thorough guide to non-persistent topics from an administrative perspective, see the [Non-persistent topics](admin-api-non-persistent-topics.md) guide.
+
+## Enabling
+
+In order to enable non-persistent topics in a Pulsar broker, the [`enableNonPersistentTopics`](reference-configuration.md#broker-enableNonPersistentTopics) parameter must be set to `true`. This is the default, so you won't need to take any action to enable non-persistent messaging.
+
+
+> #### Configuration for standalone mode
+> If you're running Pulsar in standalone mode, the same configurable parameters are available, but in the [`standalone.conf`](reference-configuration.md#standalone) configuration file.
+
+If you'd like to enable *only* non-persistent topics in a broker, you can set the [`enablePersistentTopics`](reference-configuration.md#broker-enablePersistentTopics) parameter to `false` and the `enableNonPersistentTopics` parameter to `true`.
+
+## Managing with the CLI
+
+Non-persistent topics can be managed using the [`pulsar-admin non-persistent`](reference-pulsar-admin.md#non-persistent) command-line interface. With that interface you can perform actions like [create a partitioned non-persistent topic](reference-pulsar-admin.md#non-persistent-create-partitioned-topic), get [stats](reference-pulsar-admin.md#non-persistent-stats) for a non-persistent topic, [list](reference-pulsar-admin.md) non-persistent topics under a namespace, and more.
+
+## Using with Pulsar clients
+
+You shouldn't need to make any changes to your Pulsar clients to use non-persistent messaging beyond making sure that you use proper [topic names](#using) with `non-persistent` as the topic type.
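+
+For instance, here's a minimal sketch of a Java producer on a non-persistent topic, using the builder-style client API shown in other cookbooks; only the topic name scheme differs from the persistent case:
+
+```java
+import org.apache.pulsar.client.api.Producer;
+import org.apache.pulsar.client.api.PulsarClient;
+
+PulsarClient client = PulsarClient.builder()
+        .serviceUrl("pulsar://localhost:6650")
+        .build();
+
+// Note the non-persistent scheme in the topic name
+Producer producer = client.newProducer()
+        .topic("non-persistent://public/default/example-np-topic")
+        .create();
+
+producer.send("This message is kept only in memory".getBytes());
+```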
+
diff --git a/site2/docs/cookbooks-partitioned.md b/site2/docs/cookbooks-partitioned.md
new file mode 100644
index 0000000000000000000000000000000000000000..34ecb3e1033b4ac86f4f300e72f4421dbc60c52a
--- /dev/null
+++ b/site2/docs/cookbooks-partitioned.md
@@ -0,0 +1,85 @@
+---
+id: cookbooks-partitioned
+title: Partitioned topics
+sidebar_label: Partitioned Topics
+---
+
+By default, Pulsar topics are served by a single broker. Using only a single broker, however, limits a topic's maximum throughput. *Partitioned topics* are a special type of topic that can span multiple brokers and thus allow for much higher throughput. For an explanation of how partitioned topics work, see the [Concepts](#concepts) section below.
+
+You can [publish](#publishing-to-partitioned-topics) to partitioned topics using Pulsar's client libraries and you can [create and manage](#managing-partitioned-topics) partitioned topics using Pulsar's [admin API](admin-api-overview.md).
+
+## Publishing to partitioned topics
+
+When publishing to partitioned topics, the only difference from non-partitioned topics is that you need to specify a [routing mode](getting-started-concepts-and-architecture.md#routing-modes) when you create a new producer. Examples for [Java](#java) are below.
+
+### Java
+
+Publishing messages to partitioned topics in the Java client works much like [publishing to normal topics](client-libraries-java.md#using-producers). The difference is that you need to specify either one of the currently available message routers or a custom router.
+
+#### Routing mode
+
+You can specify the routing mode in the `ProducerConfiguration` object that you use to configure your producer. You have three options:
+
+* `SinglePartition`
+* `RoundRobinPartition`
+* `CustomPartition`
+
+Here's an example:
+
+```java
+String pulsarBrokerRootUrl = "pulsar://localhost:6650";
+String topic = "persistent://my-tenant/my-namespace/my-topic";
+
+PulsarClient client = PulsarClient.create(pulsarBrokerRootUrl);
+ProducerConfiguration config = new ProducerConfiguration();
+config.setMessageRoutingMode(ProducerConfiguration.MessageRoutingMode.SinglePartition);
+Producer producer = client.createProducer(topic, config);
+producer.send("Partitioned topic message".getBytes());
+```
+
+#### Custom message router
+
+To use a custom message router, you need to provide an implementation of the `org.apache.pulsar.client.api.MessageRouter` interface, which has just one `choosePartition` method:
+
+```java
+public interface MessageRouter extends Serializable {
+    int choosePartition(Message msg);
+}
+```
+
+Here's a (not very useful!) router that routes every message to partition 10:
+
+```java
+public class AlwaysTenRouter implements MessageRouter {
+    public int choosePartition(Message msg) {
+        return 10;
+    }
+}
+```
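+
+A slightly more realistic sketch routes messages by hashing the message key, so that all messages with the same key land on the same partition (this assumes messages carry keys and that the application supplies the topic's partition count; `numPartitions` is an illustrative parameter, not part of the interface):
+
+```java
+public class KeyHashRouter implements MessageRouter {
+    private final int numPartitions;
+
+    public KeyHashRouter(int numPartitions) {
+        this.numPartitions = numPartitions;
+    }
+
+    @Override
+    public int choosePartition(Message msg) {
+        // Mask the sign bit so the result is always a valid partition index
+        return (msg.getKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
+    }
+}
+```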
+
+With an implementation in hand, you can send messages like this:
+
+```java
+String pulsarBrokerRootUrl = "pulsar://localhost:6650";
+String topic = "persistent://my-tenant/my-namespace/my-topic";
+
+PulsarClient client = PulsarClient.create(pulsarBrokerRootUrl);
+ProducerConfiguration config = new ProducerConfiguration();
+config.setMessageRouter(new AlwaysTenRouter());
+Producer producer = client.createProducer(topic, config);
+producer.send("Partitioned topic message".getBytes());
+```
+
+
+## Pulsar admin setup
+
+{% include explanations/admin-setup.md %}
+
+## Managing partitioned topics
+
+{% include explanations/partitioned-topic-admin.md %}
+
+## Concepts
+
+{% include explanations/partitioned-topics.md %}
+
diff --git a/site2/docs/cookbooks-retention-expiry.md b/site2/docs/cookbooks-retention-expiry.md
new file mode 100644
index 0000000000000000000000000000000000000000..3564c8cf9d43320127287a5dc592ed5656ba388b
--- /dev/null
+++ b/site2/docs/cookbooks-retention-expiry.md
@@ -0,0 +1,319 @@
+---
+id: cookbooks-retention-expiry
+title: Message retention and expiry
+sidebar_label: Message retention and expiry
+---
+
+Pulsar brokers are responsible for handling messages that pass through Pulsar, including [persistent storage](getting-started-concepts-and-architecture.md#persistent-storage) of messages. By default, brokers:
+
+* immediately delete all messages that have been acknowledged on every subscription, and
+* persistently store all unacknowledged messages in a [backlog](#backlog-quotas).
+
+In Pulsar, you can override both of these default behaviors, at the namespace level, in two ways:
+
+* You can persistently store messages that have already been consumed and acknowledged for a minimum time by setting [retention policies](#retention-policies).
+* Messages that are not acknowledged within a specified timeframe can be automatically marked as consumed by specifying the [time to live](#time-to-live-ttl) (TTL).
+
+Pulsar's [admin interface](admin-api-overview.md) enables you to manage both retention policies and TTL at the namespace level (and thus within a specific tenant and either on a specific cluster or in the [`global`](getting-started-concepts-and-architecture.md#global-cluster) cluster).
+
+
+> #### Retention and TTL are solving two different problems
+> * Message retention: Keep the data for at least X hours (even if acknowledged)
+> * Time-to-live: Discard data after some time (by automatically acknowledging)
+>
+> In most cases, applications will want to use either one or the other (or none).
+
+
+## Retention policies
+
+By default, when a Pulsar message arrives at a broker, it will be stored until it has been acknowledged by a consumer, at which point it will be deleted. You can override this behavior and retain even messages that have already been acknowledged by setting a *retention policy* on all the topics in a given namespace. When you set a retention policy, you can set either a *size limit* or a *time limit*.
+
+When you set a size limit of, say, 10 gigabytes, then messages in all topics in the namespace, *even acknowledged messages*, will be retained until the size limit for the topic is reached; if you set a time limit of, say, 1 day, then messages for all topics in the namespace will be retained for 24 hours.
+
+It is also possible to set an *infinite* retention time or size by setting `-1` for either the time or size limit.
+
+### Defaults
+
+There are two configuration parameters that you can use to set instance-wide defaults for message retention: [`defaultRetentionTimeInMinutes`](reference-configuration.md#broker-defaultRetentionTimeInMinutes) and [`defaultRetentionSizeInMB`](reference-configuration.md#broker-defaultRetentionSizeInMB) (both default to `0`).
+
+Both of these parameters are in the [`broker.conf`](reference-configuration.md#broker) configuration file.
+
+### Set retention policy
+
+You can set a retention policy for a namespace by specifying the namespace as well as both a size limit *and* a time limit.
+
+#### pulsar-admin
+
+Use the [`set-retention`](reference-pulsar-admin.md#namespaces-set-retention) subcommand and specify a namespace, a size limit using the `-s`/`--size` flag, and a time limit using the `-t`/`--time` flag.
+
+##### Examples
+
+To set a size limit of 10 gigabytes and a time limit of 3 hours for the `my-tenant/my-ns` namespace:
+
+```shell
+$ pulsar-admin namespaces set-retention my-tenant/my-ns \
+  --size 10G \
+  --time 3h
+```
+
+To set retention with infinite time and a size limit:
+
+```shell
+$ pulsar-admin namespaces set-retention my-tenant/my-ns \
+  --size 1T \
+  --time -1
+```
+
+Similarly, the size limit can also be set to unlimited:
+
+```shell
+$ pulsar-admin namespaces set-retention my-tenant/my-ns \
+  --size -1 \
+  --time -1
+```
+
+
+
+#### REST API
+
+```http
+POST /admin/v2/namespaces/:tenant/:namespace/retention
+```
+[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/retention)
+
+#### Java
+
+```java
+int retentionTime = 10; // 10 minutes
+int retentionSize = 500; // 500 megabytes
+RetentionPolicies policies = new RetentionPolicies(retentionTime, retentionSize);
+admin.namespaces().setRetention(namespace, policies);
+```
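+
+The `-1` convention shown in the CLI examples applies to the Java admin API as well. As a sketch:
+
+```java
+// -1 for both limits corresponds to infinite time and size retention
+RetentionPolicies infiniteRetention = new RetentionPolicies(-1, -1);
+admin.namespaces().setRetention(namespace, infiniteRetention);
+```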
+
+### Get retention policy
+
+You can fetch the retention policy for a namespace by specifying the namespace. The output will be a JSON object with two keys: `retentionTimeInMinutes` and `retentionSizeInMB`.
+
+#### pulsar-admin
+
+Use the [`get-retention`](reference-pulsar-admin.md#namespaces) subcommand and specify the namespace.
+
+##### Example
+
+```shell
+$ pulsar-admin namespaces get-retention my-tenant/my-ns
+{
+  "retentionTimeInMinutes": 10,
+  "retentionSizeInMB": 0
+}
+```
+
+#### REST API
+
+```http
+GET /admin/v2/namespaces/:tenant/:namespace/retention
+```
+
+[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/retention)
+
+#### Java
+
+```java
+admin.namespaces().getRetention(namespace);
+```
+
+## Backlog quotas
+
+*Backlogs* are sets of unacknowledged messages for a topic that have been stored by bookies. Pulsar stores all unacknowledged messages in backlogs until they are processed and acknowledged.
+
+You can control the allowable size of backlogs, at the namespace level, using *backlog quotas*. Setting a backlog quota involves setting:
+
+* an allowable *size threshold* for each topic in the namespace
+* a *retention policy* that determines which action the broker takes if the threshold is exceeded.
+
+The following retention policies are available:
+
+Policy | Action
+:------|:------
+`producer_request_hold` | The broker will hold and not persist produce request payload
+`producer_exception` | The broker will disconnect from the client by throwing an exception
+`consumer_backlog_eviction` | The broker will begin discarding backlog messages
+
+
+> #### Beware the distinction between retention policy types
+> As you may have noticed, there are two definitions of the term "retention policy" in Pulsar, one that applies to persistent storage of already-acknowledged messages and one that applies to backlogs.
+
+
+Backlog quotas are handled at the namespace level and can be managed using the interfaces described below.
+
+### Set size thresholds and backlog retention policies
+
+You can set a size threshold and backlog retention policy for all of the topics in a namespace by specifying the namespace, a size limit, and a policy by name.
+
+#### pulsar-admin
+
+Use the [`set-backlog-quota`](reference-pulsar-admin.md#namespaces) subcommand and specify a namespace, a size limit using the `-l`/`--limit` flag, and a retention policy using the `-p`/`--policy` flag.
+
+##### Example
+
+```shell
+$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \
+  --limit 2G \
+  --policy producer_request_hold
+```
+
+#### REST API
+
+```http
+POST /admin/v2/namespaces/:tenant/:namespace/backlogQuota
+```
+
+[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/backlogQuota)
+
+#### Java
+
+```java
+long sizeLimit = 2147483648L;
+BacklogQuota.RetentionPolicy policy = BacklogQuota.RetentionPolicy.producer_request_hold;
+BacklogQuota quota = new BacklogQuota(sizeLimit, policy);
+admin.namespaces().setBacklogQuota(namespace, quota);
+```
+
+### Get backlog threshold and backlog retention policy
+
+You can see which size threshold and backlog retention policy has been applied to a namespace.
+
+#### pulsar-admin
+
+Use the [`get-backlog-quotas`](reference-pulsar-admin.md#pulsar-admin-namespaces-get-backlog-quotas) subcommand and specify a namespace. Here's an example:
+
+```shell
+$ pulsar-admin namespaces get-backlog-quotas my-tenant/my-ns
+{
+  "destination_storage": {
+    "limit" : 2147483648,
+    "policy" : "producer_request_hold"
+  }
+}
+```
+
+#### REST API
+
+```http
+GET /admin/v2/namespaces/:tenant/:namespace/backlogQuotaMap
+```
+
+[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/backlogQuota)
+
+#### Java
+
+```java
+Map<BacklogQuota.BacklogQuotaType, BacklogQuota> quotas =
+  admin.namespaces().getBacklogQuotas(namespace);
+```
+
+### Remove backlog quotas
+
+#### pulsar-admin
+
+Use the [`remove-backlog-quota`](reference-pulsar-admin.md#pulsar-admin-namespaces-remove-backlog-quota) subcommand and specify a namespace. Here's an example:
+
+```shell
+$ pulsar-admin namespaces remove-backlog-quota my-tenant/my-ns
+```
+
+#### REST API
+
+```http
+DELETE /admin/v2/namespaces/:tenant/:namespace/backlogQuota
+```
+
+
+[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/backlogQuota)
+
+#### Java
+
+```java
+admin.namespaces().removeBacklogQuota(namespace);
+```
+
+### Clear backlog
+
+#### pulsar-admin
+
+Use the [`clear-backlog`](reference-pulsar-admin.md#pulsar-admin-namespaces-clear-backlog) subcommand.
+
+##### Example
+
+```shell
+$ pulsar-admin namespaces clear-backlog my-tenant/my-ns
+```
+
+By default, you will be prompted to ensure that you really want to clear the backlog for the namespace. You can override the prompt using the `-f`/`--force` flag.
+
+## Time to live (TTL)
+
+By default, Pulsar stores all unacknowledged messages forever. This can lead to heavy disk space usage in cases where a lot of messages are going unacknowledged. If disk space is a concern, you can set a time to live (TTL) that determines how long unacknowledged messages will be retained.
+
+### Set the TTL for a namespace
+
+#### pulsar-admin
+
+Use the [`set-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-set-message-ttl) subcommand and specify a namespace and a TTL (in seconds) using the `-ttl`/`--messageTTL` flag.
+
+##### Example
+
+```shell
+$ pulsar-admin namespaces set-message-ttl my-tenant/my-ns \
+  --messageTTL 120 # TTL of 2 minutes
+```
+
+#### REST API
+
+```http
+POST /admin/v2/namespaces/:tenant/:namespace/messageTTL
+```
+
+[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/messageTTL)
+
+#### Java
+
+```java
+admin.namespaces().setNamespaceMessageTTL(namespace, ttlInSeconds);
+```
+
+### Get the TTL configuration for a namespace
+
+#### pulsar-admin
+
+Use the [`get-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-get-message-ttl) subcommand and specify a namespace.
+
+##### Example
+
+```shell
+$ pulsar-admin namespaces get-message-ttl my-tenant/my-ns
+60
+```
+
+#### REST API
+
+```http
+GET /admin/v2/namespaces/:tenant/:namespace/messageTTL
+```
+
+
+[More info](reference-rest-api.md#/admin/namespaces/:property/:cluster/:namespace/messageTTL)
+
+#### Java
+
+```java
+admin.namespaces().getNamespaceMessageTTL(namespace);
+```
+
diff --git a/site2/docs/cookbooks-tiered-storage.md b/site2/docs/cookbooks-tiered-storage.md
new file mode 100644
index 0000000000000000000000000000000000000000..792592e4a2b0b9b13bcae0564fdd8988390c0efe
--- /dev/null
+++ b/site2/docs/cookbooks-tiered-storage.md
@@ -0,0 +1,137 @@
+---
+id: cookbooks-tiered-storage
+title: Tiered Storage
+sidebar_label: Tiered Storage
+---
+
+Pulsar's **Tiered Storage** feature allows older backlog data to be offloaded to long-term storage, thereby freeing up space in BookKeeper and reducing storage costs. This cookbook walks you through using tiered storage in your Pulsar cluster.
+
+## When should I use Tiered Storage?
+
+Tiered storage should be used when you have a topic for which you want to keep a very long backlog for a long time. For example, if you have a topic containing user actions which you use to train your recommendation systems, you may want to keep that data for a long time, so that if you change your recommendation algorithm you can rerun it against your full user history.
+
+## The offloading mechanism
+
+A topic in Pulsar is backed by a log, known as a managed ledger. This log is composed of an ordered list of segments. Pulsar only ever writes to the final segment of the log. All previous segments are sealed, and the data within them is immutable. This is known as a segment-oriented architecture.
+
+![Tiered storage](/docs/assets/pulsar-tiered-storage.png "Tiered Storage")
+
+The Tiered Storage offloading mechanism takes advantage of this segment-oriented architecture. When offloading is requested, the segments of the log are copied, one-by-one, to tiered storage. All segments of the log, apart from the segment currently being written to, can be offloaded.
+
+## Amazon S3
+
+Tiered storage currently supports S3 for long-term storage. On the broker, the administrator must configure an S3 bucket and the AWS region where the bucket exists. Offloaded data will be placed into this bucket.
+
+The configured S3 bucket must exist before attempting to offload. If it does not exist, the offload operation will fail.
+
+Pulsar uses multipart objects to upload the segment data, and it is possible that a broker could crash while uploading the data. We recommend you add a lifecycle rule to your S3 bucket to expire incomplete multipart uploads after a day or two, to avoid being charged for incomplete uploads.
+
+### Configuring the broker
+
+Offloading is configured in `broker.conf`.
+
+At a minimum, the user must configure the driver, the region and the bucket.
+
+```conf
+managedLedgerOffloadDriver=S3
+s3ManagedLedgerOffloadRegion=eu-west-3
+s3ManagedLedgerOffloadBucket=pulsar-topic-offload
+```
+
+It is also possible to specify the S3 endpoint directly, using `s3ManagedLedgerOffloadServiceEndpoint`. This is useful if you are using a non-AWS storage service which provides an S3-compatible API.
+
+> If the endpoint is specified directly, then the region must _not_ be set.
+
+> The `broker.conf` of all brokers must have the same configuration for driver, region and bucket for offload to avoid data becoming unavailable as topics move from one broker to another.
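+
+For an S3-compatible (non-AWS) store, an endpoint-based configuration might therefore look like this (the endpoint URL is illustrative):
+
+```conf
+managedLedgerOffloadDriver=S3
+s3ManagedLedgerOffloadBucket=pulsar-topic-offload
+# Region must not be set when the endpoint is specified directly
+s3ManagedLedgerOffloadServiceEndpoint=http://s3.example.internal:9000
+```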
+
+Pulsar also provides some knobs to configure the size of requests sent to S3.
+
+- `s3ManagedLedgerOffloadMaxBlockSizeInBytes` configures the maximum size of a "part" sent during a multipart upload. This cannot be smaller than 5MB. Default is 64MB.
+- `s3ManagedLedgerOffloadReadBufferSizeInBytes` configures the block size for each individual read when reading back data from S3. Default is 1MB.
+
+In both cases, these should not be touched unless you know what you are doing.
+
+> The broker must be rebooted for any changes in the configuration to take effect.
+
+### Authenticating with S3
+
+To be able to access S3, you need to authenticate with S3. Pulsar does not provide any direct means of configuring authentication for S3, but relies on the mechanisms supported by the [DefaultAWSCredentialsProviderChain](https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html).
+
+Once you have created a set of credentials in the AWS IAM console, they can be configured in a number of ways.
+
+1. Set the environment variables **AWS_ACCESS_KEY_ID** and **AWS_SECRET_ACCESS_KEY** in `conf/pulsar_env.sh`.
+
+```bash
+export AWS_ACCESS_KEY_ID=ABC123456789
+export AWS_SECRET_ACCESS_KEY=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c
+```
+
+> `export` is important so that the variables are made available in the environment of spawned processes.
+
+
+2. Add the Java system properties *aws.accessKeyId* and *aws.secretKey* to **PULSAR_EXTRA_OPTS** in `conf/pulsar_env.sh`.
+
+```bash
+PULSAR_EXTRA_OPTS="${PULSAR_EXTRA_OPTS} ${PULSAR_MEM} ${PULSAR_GC} -Daws.accessKeyId=ABC123456789 -Daws.secretKey=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c -Dio.netty.leakDetectionLevel=disabled -Dio.netty.recycler.maxCapacity.default=1000 -Dio.netty.recycler.linkCapacity=1024"
+```
+
+3. Set the access credentials in `~/.aws/credentials`.
+
+```conf
+[default]
+aws_access_key_id=ABC123456789
+aws_secret_access_key=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c
+```
+
+If you are running in EC2, you can also use instance profile credentials, provided through the EC2 metadata service, but that is out of scope for this cookbook.
+
+> The broker must be rebooted for credentials specified in `conf/pulsar_env.sh` to take effect.
+
+## Configuring offload to run automatically
+
+Namespace policies can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that the topic has stored on the Pulsar cluster. Once the topic reaches the threshold, an offload operation will be triggered. Setting a negative value for the threshold will disable automatic offloading. Setting the threshold to 0 will cause the broker to offload data as soon as it possibly can.
+
+```bash
+$ bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace
+```
+
+> Automatic offload runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, offload will not be triggered until the current segment is full.
+
+
+## Triggering offload manually
+
+Offloading can be manually triggered through a REST endpoint on the Pulsar broker. Pulsar provides a CLI command which will call this REST endpoint for you.
+
+When triggering offload, you must specify the maximum size of the backlog which will be retained locally in BookKeeper. The offload mechanism will offload segments from the start of the topic backlog until this condition is met.
+
+```bash
+$ bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1
+Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1
+```
+
+The command that triggers an offload will not wait until the offload operation has completed. To check the status of the offload, use `offload-status`.
+
+```bash
+$ bin/pulsar-admin topics offload-status my-tenant/my-namespace/topic1
+Offload is currently running
+```
+
+To wait for offload to complete, add the `-w` flag.
+
+```bash
+$ bin/pulsar-admin topics offload-status -w my-tenant/my-namespace/topic1
+Offload was a success
+```
+
+If there is an error offloading, the error will be propagated to the `offload-status` command.
+
+```bash
+$ bin/pulsar-admin topics offload-status persistent://public/default/topic1
+Error in offload
+null
+
+Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=
+```
+
diff --git a/site2/docs/deploy-aws.md b/site2/docs/deploy-aws.md
new file mode 100644
index 0000000000000000000000000000000000000000..f533e99b7d37c49b590b6aa2334c8035b40f758d
--- /dev/null
+++ b/site2/docs/deploy-aws.md
@@ -0,0 +1,185 @@
+---
+id: deploy-aws
+title: Deploying a Pulsar cluster on AWS using Terraform and Ansible
+sidebar_label: Amazon Web Services
+---
+
+> For instructions on deploying a single Pulsar cluster manually rather than using Terraform and Ansible, see [Deploying a Pulsar cluster on bare metal](deploy-bare-metal.md). For instructions on manually deploying a multi-cluster Pulsar instance, see [Deploying a Pulsar instance on bare metal](deploy-bare-metal-multi-cluster.md).
+
+One of the easiest ways to get a Pulsar cluster running on [Amazon Web Services](https://aws.amazon.com/) (AWS) is to use the [Terraform](https://terraform.io) infrastructure provisioning tool and the [Ansible](https://www.ansible.com) server automation tool. Terraform can create the resources necessary to run the Pulsar cluster---[EC2](https://aws.amazon.com/ec2/) instances, networking and security infrastructure, etc.---while Ansible can install and run Pulsar on the provisioned resources.
+
+## Requirements and setup
+
+In order to install a Pulsar cluster on AWS using Terraform and Ansible, you'll need:
+
+* An [AWS account](https://aws.amazon.com/account/) and the [`aws`](https://aws.amazon.com/cli/) command-line tool
+* Python and [pip](https://pip.pypa.io/en/stable/)
+* The [`terraform-inventory`](https://github.com/adammck/terraform-inventory) tool, which enables Ansible to use Terraform artifacts
+
+You'll also need to make sure that you're currently logged into your AWS account via the `aws` tool:
+
+```bash
+$ aws configure
+```
+
+## Installation
+
+You can install Ansible on Linux or macOS using pip.
+
+```bash
+$ pip install ansible
+```
+
+You can install Terraform using the instructions [here](https://www.terraform.io/intro/getting-started/install.html).
+
+You'll also need to have the Terraform and Ansible configurations for Pulsar locally on your machine. They're contained in Pulsar's [GitHub repository](https://github.com/apache/incubator-pulsar), which you can fetch using Git:
+
+```bash
+$ git clone https://github.com/apache/incubator-pulsar
+$ cd incubator-pulsar/deployment/terraform-ansible/aws
+```
+
+## SSH setup
+
+In order to create the necessary AWS resources using Terraform, you'll need to create an SSH key. To create a private SSH key in `~/.ssh/id_rsa` and a public key in `~/.ssh/id_rsa.pub`:
+
+```bash
+$ ssh-keygen -t rsa
+```
+
+Do *not* enter a passphrase (hit **Enter** when prompted instead). To verify that a key has been created:
+
+```bash
+$ ls ~/.ssh
+id_rsa id_rsa.pub
+```
+
+## Creating AWS resources using Terraform
+
+To get started building AWS resources with Terraform, you'll need to install all Terraform dependencies:
+
+```bash
+$ terraform init
+# This will create a .terraform folder
+```
+
+Once you've done that, you can apply the default Terraform configuration:
+
+```bash
+$ terraform apply
+```
+
+You should then see this prompt:
+
+```bash
+Do you want to perform these actions?
+  Terraform will perform the actions described above.
+  Only 'yes' will be accepted to approve.
+
+  Enter a value:
+```
+
+Type `yes` and hit **Enter**. Applying the configuration could take several minutes. When it's finished, you should see `Apply complete!` along with some other information, including the number of resources created.
+
+### Applying a non-default configuration
+
+You can apply a non-default Terraform configuration by changing the values in the `terraform.tfvars` file. The following variables are available:
+
+Variable name | Description | Default
+:-------------|:------------|:-------
+`public_key_path` | The path of the public key that you've generated. | `~/.ssh/id_rsa.pub`
+`region` | The AWS region in which the Pulsar cluster will run | `us-west-2`
+`availability_zone` | The AWS availability zone in which the Pulsar cluster will run | `us-west-2a`
+`aws_ami` | The [Amazon Machine Image](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html) (AMI) that will be used by the cluster | `ami-9fa343e7`
+`num_zookeeper_nodes` | The number of [ZooKeeper](https://zookeeper.apache.org) nodes in the ZooKeeper cluster | 3
+`num_pulsar_brokers` | The number of Pulsar brokers and BookKeeper bookies that will run in the cluster | 3
+`base_cidr_block` | The root [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing) that will be used by network assets for the cluster | `10.0.0.0/16`
+`instance_types` | The EC2 instance types to be used. This variable is a map with two keys: `zookeeper` for the ZooKeeper instances and `pulsar` for the Pulsar brokers and BookKeeper bookies | `t2.small` (ZooKeeper) and `i3.xlarge` (Pulsar/BookKeeper)
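+
+As an illustrative sketch, a `terraform.tfvars` file that overrides a few of these defaults might look like this (the values are examples only):
+
+```conf
+public_key_path     = "~/.ssh/id_rsa.pub"
+region              = "us-west-2"
+availability_zone   = "us-west-2a"
+num_zookeeper_nodes = 3
+num_pulsar_brokers  = 4
+```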
+
+### What is installed
+
+When you run the Ansible playbook, the following AWS resources will be used:
+
+* 6 total [Elastic Compute Cloud](https://aws.amazon.com/ec2) (EC2) instances running the [ami-9fa343e7](https://access.redhat.com/articles/3135091) Amazon Machine Image (AMI), which runs [Red Hat Enterprise Linux (RHEL) 7.4](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html-single/7.4_release_notes/index). By default, that includes:
+  * 3 small VMs for ZooKeeper ([t2.small](https://www.ec2instances.info/?selected=t2.small) instances)
+  * 3 larger VMs for Pulsar brokers and BookKeeper bookies ([i3.4xlarge](https://www.ec2instances.info/?selected=i3.4xlarge) instances)
+* An EC2 [security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html)
+* A [virtual private cloud](https://aws.amazon.com/vpc/) (VPC) for security
+* An [API Gateway](https://aws.amazon.com/api-gateway/) for connections from the outside world
+* A [route table](http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Route_Tables.html) for the Pulsar cluster's VPC
+* A [subnet](http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Subnets.html) for the VPC
+
+All EC2 instances for the cluster will run in the [us-west-2](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html) region.
+
+### Fetching your Pulsar connection URL
+
+When you apply the Terraform configuration by running `terraform apply`, Terraform will output a value for the `pulsar_service_url`. It should look something like this:
+
+```
+pulsar://pulsar-elb-1800761694.us-west-2.elb.amazonaws.com:6650
+```
+
+You can fetch that value at any time by running `terraform output pulsar_service_url` or by parsing the `terraform.tfstate` file (which is JSON, even though the filename doesn't reflect that):
+
+```bash
+$ cat terraform.tfstate | jq .modules[0].outputs.pulsar_service_url.value
+```
+
+### Destroying your cluster
+
+At any point, you can destroy all AWS resources associated with your cluster using Terraform's `destroy` command:
+
+```bash
+$ terraform destroy
+```
+
+## Running the Pulsar playbook
+
+Once you've created the necessary AWS resources using Terraform, you can install and run Pulsar on the Terraform-created EC2 instances using Ansible. To do so, use this command:
+
+```bash
+$ ansible-playbook \
+  --user='ec2-user' \
+  --inventory=`which terraform-inventory` \
+  ../deploy-pulsar.yaml
+```
+
+If you've created a private SSH key at a location different from `~/.ssh/id_rsa`, you can specify the different location using the `--private-key` flag:
+
+```bash
+$ ansible-playbook \
+  --user='ec2-user' \
+  --inventory=`which terraform-inventory` \
+  --private-key="~/.ssh/some-non-default-key" \
+  ../deploy-pulsar.yaml
+```
+
+## Accessing the cluster
+
+You can now access your running Pulsar cluster using the unique Pulsar connection URL for your cluster, which you can obtain using the instructions [above](#fetching-your-pulsar-connection-url).
+
+For a quick demonstration of accessing the cluster, we can use the Python client for Pulsar and the Python shell. First, install the Pulsar Python module using pip:
+
+```bash
+$ pip install pulsar-client
+```
+
+Now, open up the Python shell using the `python` command:
+
+```bash
+$ python
+```
+
+Once in the shell, run the following:
+
+```python
+>>> import pulsar
+>>> client = pulsar.Client('pulsar://pulsar-elb-1800761694.us-west-2.elb.amazonaws.com:6650')
+# Make sure to use your connection URL
+>>> producer = client.create_producer('persistent://public/default/test-topic')
+>>> producer.send('Hello world')
+>>> client.close()
+```
+
+If all of these commands are successful, your cluster can now be used by Pulsar clients!
+
diff --git a/site2/docs/deploy-bare-metal-multi-cluster.md b/site2/docs/deploy-bare-metal-multi-cluster.md
new file mode 100644
index 0000000000000000000000000000000000000000..177740ccb63e3f4d6b14d0a9337c111541039df7
--- /dev/null
+++ b/site2/docs/deploy-bare-metal-multi-cluster.md
@@ -0,0 +1,399 @@
+---
+id: deploy-bare-metal-multi-cluster
+title: Deploying a multi-cluster on bare metal
+sidebar_label: Bare metal multi-cluster
+---
+
+> Single-cluster Pulsar installations should be sufficient for all but the most ambitious use cases. If you're interested in experimenting with Pulsar or using it in a startup or on a single team, we recommend opting for a single cluster. For instructions on deploying a single cluster, see the guide [here](deploy-bare-metal.md).
+
+A Pulsar *instance* consists of multiple Pulsar clusters working in unison. Clusters can be distributed across data centers or geographical regions and can replicate amongst themselves using [geo-replication](administration-geo.md). Deploying a multi-cluster Pulsar instance involves the following basic steps:
+
+* Deploying two separate [ZooKeeper](#deploying-zookeeper) quorums: a [local](#deploying-local-zookeeper) quorum for each cluster in the instance and a [configuration store](#deploying-the-configuration-store) quorum for instance-wide tasks
+* Initializing [cluster metadata](#cluster-metadata-initialization) for each cluster
+* Deploying a [BookKeeper cluster](#deploying-bookkeeper) of bookies in each Pulsar cluster
+* Deploying [brokers](#deploying-brokers) in each Pulsar cluster
+
+If you're deploying a single Pulsar cluster, see the [Clusters and Brokers](getting-started-standalone.md#starting-the-cluster) guide.
+
+> #### Running Pulsar locally or on Kubernetes?
+> This guide shows you how to deploy Pulsar in production in a non-Kubernetes environment. If you'd like to run a standalone Pulsar cluster on a single machine for development purposes, see the [Setting up a local cluster](getting-started-standalone.md) guide. If you're looking to run Pulsar on [Kubernetes](https://kubernetes.io), see the [Pulsar on Kubernetes](deploy-kubernetes.md) guide, which includes sections on running Pulsar on Kubernetes on [Google Kubernetes Engine](deploy-kubernetes.md#pulsar-on-google-kubernetes-engine) and on [Amazon Web Services](deploy-kubernetes.md#pulsar-on-amazon-web-services).
+
+## System requirements
+Pulsar is currently available for **MacOS** and **Linux**. In order to use Pulsar, you'll need to install [Java 8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html).
+
+## Installing Pulsar
+
+To get started running Pulsar, download a binary tarball release in one of the following ways:
+
+* by clicking the link below and downloading the release from an Apache mirror:
+
+  * Pulsar pulsar:version binary release
+
+* from the Pulsar [downloads page](pulsar:download_page_url)
+* from the Pulsar [releases page](https://github.com/apache/incubator-pulsar/releases/latest)
+* using [wget](https://www.gnu.org/software/wget):
+
+  ```shell
+  $ wget 'https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=incubator/pulsar/pulsar-pulsar:version/apache-pulsar-pulsar:version-bin.tar.gz' -O apache-pulsar-pulsar:version-bin.tar.gz
+  ```
+
+Once the tarball is downloaded, untar it and `cd` into the resulting directory:

+```bash
+$ tar xvfz apache-pulsar-pulsar:version-bin.tar.gz
+$ cd apache-pulsar-pulsar:version
+```
+
+## What your package contains
+
+The Pulsar binary package initially contains the following directories:
+
+Directory | Contains
+:---------|:--------
+`bin` | Pulsar's [command-line tools](reference-cli-tools.md), such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](reference-pulsar-admin.md)
+`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more
+`examples` | A Java JAR file containing example [Pulsar Functions](functions-overview.md)
+`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar
+`licenses` | License files, in `.txt` form, for various components of the Pulsar codebase
+
+These directories will be created once you begin running Pulsar:
+
+Directory | Contains
+:---------|:--------
+`data` | The data storage directory used by ZooKeeper and BookKeeper
+`instances` | Artifacts created for [Pulsar Functions](functions-overview.md)
+`logs` | Logs created by the installation
+
+
+## Deploying ZooKeeper
+
+Each Pulsar instance relies on two separate ZooKeeper quorums.
+
+* [Local ZooKeeper](#deploying-local-zookeeper) operates at the cluster level and provides cluster-specific configuration management and coordination. Each Pulsar cluster needs to have a dedicated ZooKeeper cluster.
+* [Global ZooKeeper](#deploying-the-configuration-store) operates at the instance level and provides configuration management for the entire system (and thus across clusters). The global ZooKeeper quorum can be provided by an independent cluster of machines or by the same machines used by local ZooKeeper.
+
+### Deploying local ZooKeeper
+
+ZooKeeper manages a variety of essential coordination- and configuration-related tasks for Pulsar.
+
+Deploying a Pulsar instance requires you to stand up one local ZooKeeper cluster *per Pulsar cluster*.
+
+To begin, add all ZooKeeper servers to the quorum configuration specified in the [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file. Add a `server.N` line for each node in the cluster to the configuration, where `N` is the number of the ZooKeeper node. Here's an example for a three-node cluster:
+
+```properties
+server.1=zk1.us-west.example.com:2888:3888
+server.2=zk2.us-west.example.com:2888:3888
+server.3=zk3.us-west.example.com:2888:3888
+```
+
+On each host, you need to specify the ID of the node in each node's `myid` file, which is in each server's `data/zookeeper` folder by default (this can be changed via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter).
+
+> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed info on `myid` and more.
+
+On a ZooKeeper server at `zk1.us-west.example.com`, for example, you could set the `myid` value like this:
+
+```shell
+$ mkdir -p data/zookeeper
+$ echo 1 > data/zookeeper/myid
+```
+
+On `zk2.us-west.example.com` the command would be `echo 2 > data/zookeeper/myid` and so on.
+
+Once each server has been added to the `zookeeper.conf` configuration and has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool:
+
+```shell
+$ bin/pulsar-daemon start zookeeper
+```
+
+### Deploying the configuration store
+
+The ZooKeeper cluster configured and started up in the section above is a *local* ZooKeeper cluster used to manage a single Pulsar cluster. In addition to a local cluster, however, a full Pulsar instance also requires a configuration store for handling some instance-level configuration and coordination tasks.
+
+If you're deploying a [single-cluster](#single-cluster-pulsar-instance) instance, then you will not need a separate cluster for the configuration store. If, however, you're deploying a [multi-cluster](#multi-cluster-pulsar-instance) instance, then you should stand up a separate ZooKeeper cluster for configuration tasks.
+
+#### Single-cluster Pulsar instance
+
+If your Pulsar instance will consist of just one cluster, then you can deploy a configuration store on the same machines as the local ZooKeeper quorum but running on different TCP ports.
+
+To deploy a ZooKeeper configuration store in a single-cluster instance, add the same ZooKeeper servers used by the local quorum to the configuration file in [`conf/global_zookeeper.conf`](reference-configuration.md#configuration-store) using the same method as for [local ZooKeeper](#deploying-local-zookeeper), but make sure to use a different port (2181 is the default for ZooKeeper). Here's an example that uses port 2184 for a three-node ZooKeeper cluster:
+
+```properties
+clientPort=2184
+server.1=zk1.us-west.example.com:2185:2186
+server.2=zk2.us-west.example.com:2185:2186
+server.3=zk3.us-west.example.com:2185:2186
+```
+
+As before, create the `myid` files for each server in `data/global-zookeeper/myid`.
+
+#### Multi-cluster Pulsar instance
+
+When deploying a global Pulsar instance, with clusters distributed across different geographical regions, the global ZooKeeper serves as a highly available and strongly consistent metadata store that can tolerate failures and partitions spanning whole regions.
+
+The key here is to make sure the ZK quorum members are spread across at least 3 regions and that other regions are running as observers.
+
+Again, given the very low expected load on the global ZooKeeper servers, we can share the same hosts used for the local ZooKeeper quorum.
+
+For example, let's assume a Pulsar instance with the following clusters: `us-west`, `us-east`, `us-central`, `eu-central`, and `ap-south`. Let's also assume that each cluster will have its own local ZK servers named like
+
+```
+zk[1-3].${CLUSTER}.example.com
+```
+
+In this scenario we want to pick the quorum participants from a few clusters and let all the others be ZK observers. For example, to form a 7-server quorum, we can pick 3 servers from `us-west`, 2 from `us-central` and 2 from `us-east`.

This guarantees that writes to global ZooKeeper will remain possible even if one
of these regions is unreachable.

The ZooKeeper configuration on all the servers will look like this:

```properties
clientPort=2184
server.1=zk1.us-west.example.com:2185:2186
server.2=zk2.us-west.example.com:2185:2186
server.3=zk3.us-west.example.com:2185:2186
server.4=zk1.us-central.example.com:2185:2186
server.5=zk2.us-central.example.com:2185:2186
server.6=zk3.us-central.example.com:2185:2186:observer
server.7=zk1.us-east.example.com:2185:2186
server.8=zk2.us-east.example.com:2185:2186
server.9=zk3.us-east.example.com:2185:2186:observer
server.10=zk1.eu-central.example.com:2185:2186:observer
server.11=zk2.eu-central.example.com:2185:2186:observer
server.12=zk3.eu-central.example.com:2185:2186:observer
server.13=zk1.ap-south.example.com:2185:2186:observer
server.14=zk2.ap-south.example.com:2185:2186:observer
server.15=zk3.ap-south.example.com:2185:2186:observer
```

Additionally, ZooKeeper observers will need to have the following in their configuration:

```properties
peerType=observer
```

##### Starting the service

Once your global ZooKeeper configuration is in place, you can start up the service using [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon):

```shell
$ bin/pulsar-daemon start global-zookeeper
```

## Cluster metadata initialization

Once you've set up the cluster-specific ZooKeeper and configuration store quorums for your instance, there is some metadata that needs to be written to ZooKeeper for each cluster in your instance. **It only needs to be written once**.

You can initialize this metadata using the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool. Here's an example:

```shell
$ bin/pulsar initialize-cluster-metadata \
  --cluster us-west \
  --zookeeper zk1.us-west.example.com:2181 \
  --configuration-store zk1.us-west.example.com:2184 \
  --web-service-url http://pulsar.us-west.example.com:8080/ \
  --web-service-url-tls https://pulsar.us-west.example.com:8443/ \
  --broker-service-url pulsar://pulsar.us-west.example.com:6650/ \
  --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651/
```

As you can see from the example above, the following needs to be specified:

* The name of the cluster
* The local ZooKeeper connection string for the cluster
* The configuration store connection string for the entire instance
* The web service URL for the cluster
* A broker service URL enabling interaction with the brokers in the cluster

If you're using [TLS](administration-auth.md#tls-client-auth), you'll also need to specify a TLS web service URL for the cluster as well as a TLS broker service URL for the brokers in the cluster.

Make sure to run `initialize-cluster-metadata` for each cluster in your instance.

## Deploying BookKeeper

BookKeeper provides [persistent message storage](getting-started-concepts-and-architecture.md#persistent-storage) for Pulsar.

Each Pulsar cluster needs to have its own cluster of bookies. The BookKeeper cluster shares a local ZooKeeper quorum with the Pulsar cluster.

### Configuring bookies

BookKeeper bookies can be configured using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file.
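
As a minimal sketch (reusing the example hostnames from this guide; treat them as placeholders), the heart of that file is a single connection-string setting, explained next:

```shell
# Excerpt of conf/bookkeeper.conf; the hostnames are the example
# values used throughout this guide
zkServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181
```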
The most important aspect of configuring each bookie is ensuring that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) parameter is set to the connection string for the Pulsar cluster's local ZooKeeper.

### Starting up bookies

You can start up a bookie in two ways: in the foreground or as a background daemon.

To start up a bookie as a background daemon, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool:

```shell
$ bin/pulsar-daemon start bookie
```

To run a bookie in the foreground instead, use the [`bookkeeper`](reference-cli-tools.md#bookkeeper) CLI tool: `bin/bookkeeper bookie`.

You can verify that the bookie is working properly using the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#bookkeeper-shell):

```shell
$ bin/bookkeeper shell bookiesanity
```

This will create a new ledger on the local bookie, write a few entries, read them back, and finally delete the ledger.

### Hardware considerations

Bookie hosts are responsible for storing message data on disk. In order for bookies to provide optimal performance, it's essential that they have a suitable hardware configuration. There are two key dimensions to bookie hardware capacity:

* Read/write disk I/O capacity
* Storage capacity

Message entries written to bookies are always synced to disk before returning an acknowledgement to the Pulsar broker. To ensure low write latency, BookKeeper is
designed to use multiple devices:

* A **journal** to ensure durability. For sequential writes, it's critical to have fast [fsync](https://linux.die.net/man/2/fsync) operations on bookie hosts. Typically, small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) should suffice, or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache. Both solutions can reach fsync latency of ~0.4 ms.
* A **ledger storage device** is where data is stored until all consumers have acknowledged the message. Writes happen in the background, so write I/O is not a big concern. Reads happen sequentially most of the time, and they occur in bulk only when a consumer drains a backlog. To store large amounts of data, a typical configuration will involve multiple HDDs with a RAID controller.



## Deploying brokers

Once you've set up ZooKeeper, initialized cluster metadata, and spun up BookKeeper bookies, you can deploy brokers.

### Broker configuration

Brokers can be configured using the [`conf/broker.conf`](reference-configuration.md#broker) configuration file.

The most important element of broker configuration is ensuring that each broker is aware of its local ZooKeeper quorum as well as the configuration store quorum. Make sure that you set the [`zookeeperServers`](reference-configuration.md#broker-zookeeperServers) parameter to reflect the local quorum and the [`configurationStoreServers`](reference-configuration.md#broker-configurationStoreServers) parameter to reflect the configuration store quorum (although you'll need to specify only those ZooKeeper servers located in the same cluster).

You also need to specify the name of the cluster to which the broker belongs using the [`clusterName`](reference-configuration.md#broker-clusterName) parameter.

Here's an example configuration:

```properties
# Local ZooKeeper servers
zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181

# Configuration store quorum connection string.
configurationStoreServers=zk1.us-west.example.com:2184,zk2.us-west.example.com:2184,zk3.us-west.example.com:2184

clusterName=us-west
```

### Broker hardware

Pulsar brokers do not require any special hardware, since they don't use the local disk. Fast CPUs and a 10Gbps [NIC](https://en.wikipedia.org/wiki/Network_interface_controller) are recommended, since the software can take full advantage of both.

### Starting the broker service

You can start a broker in the background using [nohup](https://en.wikipedia.org/wiki/Nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool:

```shell
$ bin/pulsar-daemon start broker
```

You can also start brokers in the foreground using [`pulsar broker`](reference-cli-tools.md#pulsar-broker):

```shell
$ bin/pulsar broker
```

## Service discovery

[Clients](getting-started-clients.md) connecting to Pulsar brokers need to be able to communicate with an entire Pulsar instance using a single URL. Pulsar provides a built-in service discovery mechanism that you can set up using the instructions [immediately below](#service-discovery-setup).

You can also use your own service discovery system if you'd like. If you use your own system, there is just one requirement: when a client performs an HTTP request to an [endpoint](reference-configuration.md) for a Pulsar cluster, such as `http://pulsar.us-west.example.com:8080`, the client needs to be redirected to *some* active broker in the desired cluster, whether via DNS, an HTTP or IP redirect, or some other means.

> #### Service discovery already provided by many scheduling systems
> Many large-scale deployment systems, such as [Kubernetes](deploy-kubernetes.md), have service discovery systems built in. If you're running Pulsar on such a system, you may not need to provide your own service discovery mechanism.


### Service discovery setup

The service discovery mechanism included with Pulsar maintains a list of active brokers, stored in ZooKeeper, and supports lookups using both HTTP and Pulsar's [binary protocol](developing-binary-protocol.md).

To get started setting up Pulsar's built-in service discovery, you need to change a few parameters in the [`conf/discovery.conf`](reference-configuration.md#service-discovery) configuration file. Set the [`zookeeperServers`](reference-configuration.md#service-discovery-zookeeperServers) parameter to the cluster's ZooKeeper quorum connection string and the [`configurationStoreServers`](reference-configuration.md#service-discovery-configurationStoreServers) setting to the configuration store quorum connection string.

```properties
# ZooKeeper quorum connection string
zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181

# Global configuration store connection string
configurationStoreServers=zk1.us-west.example.com:2184,zk2.us-west.example.com:2184,zk3.us-west.example.com:2184
```

To start the discovery service:

```shell
$ bin/pulsar-daemon start discovery
```



## Admin client and verification

At this point your Pulsar instance should be ready to use. You can now configure client machines that can serve as [administrative clients](admin-api-overview.md) for each cluster. You can use the [`conf/client.conf`](reference-configuration.md#client) configuration file to configure admin clients.

The most important thing is that you point the [`serviceUrl`](reference-configuration.md#client-serviceUrl) parameter to the correct service URL for the cluster:

```properties
serviceUrl=http://pulsar.us-west.example.com:8080/
```

## Provisioning new tenants

Pulsar was built as a fundamentally multi-tenant system.

To allow a new tenant to use the system, you first need to create it. You can create a new tenant using the [`pulsar-admin`](reference-pulsar-admin.md#tenants-create) CLI tool:

```shell
$ bin/pulsar-admin tenants create test-tenant \
  --allowed-clusters us-west \
  --admin-roles test-admin-role
```

This will allow users who identify with the role `test-admin-role` to administer the configuration for the tenant `test-tenant`, which will only be allowed to use the cluster `us-west`. From now on, this tenant will be able to self-manage its resources.

Once a tenant has been created, you will need to create namespaces for topics within that tenant.

The first step is to create a namespace. A namespace is an administrative unit that can contain many topics. A common practice is to create a namespace for each different use case from a single tenant.

```shell
$ bin/pulsar-admin namespaces create test-tenant/ns1
```

### Testing producer and consumer

Everything is now ready to send and receive messages. The quickest way to test
the system is through the `pulsar-perf` client tool.

Let's use a topic in the namespace we just created. Topics are automatically
created the first time a producer or a consumer tries to use them.

The topic name in this case could be:

```http
persistent://test-tenant/ns1/my-topic
```

Start a consumer that creates a subscription on the topic and waits
for messages:

```shell
$ bin/pulsar-perf consume persistent://test-tenant/ns1/my-topic
```

Start a producer that publishes messages at a fixed rate and reports stats every
10 seconds:

```shell
$ bin/pulsar-perf produce persistent://test-tenant/ns1/my-topic
```

To report the topic stats:

```shell
$ bin/pulsar-admin persistent stats persistent://test-tenant/ns1/my-topic
```
diff --git a/site2/docs/deploy-bare-metal.md b/site2/docs/deploy-bare-metal.md
new file mode 100644
index 0000000000000000000000000000000000000000..6fb1f1d4122503cc5d64d10fae22c9d88911aa66
--- /dev/null
+++ b/site2/docs/deploy-bare-metal.md
@@ -0,0 +1,226 @@
---
id: deploy-bare-metal
title: Deploying a cluster on bare metal
sidebar_label: Bare metal
---

> Single-cluster Pulsar installations should be sufficient for all but the most ambitious use cases. If you're interested in experimenting with Pulsar or using it in a startup or on a single team, we recommend opting for a single cluster. If you do need to run a multi-cluster Pulsar instance, however, see the guide [here](deploy-bare-metal-multi-cluster.md).

Deploying a Pulsar cluster involves doing the following (in order):

* Deploying a [ZooKeeper](#deploying-a-zookeeper-cluster) cluster
* Initializing [cluster metadata](#initializing-cluster-metadata)
* Deploying a [BookKeeper](#deploying-a-bookkeeper-cluster) cluster
* Deploying one or more Pulsar [brokers](#deploying-pulsar-brokers)

### Requirements

To run Pulsar on bare metal, you will need:

* At least 6 Linux machines or VMs
  * 3 running [ZooKeeper](https://zookeeper.apache.org)
  * 3 running a Pulsar broker and a [BookKeeper](https://bookkeeper.apache.org) bookie
* A single [DNS](https://en.wikipedia.org/wiki/Domain_Name_System) name covering all of the Pulsar broker hosts

Each machine in your cluster will need to have [Java 8](http://www.oracle.com/technetwork/java/javase/downloads/index.html) or higher installed.

Here's a diagram showing the basic setup:

![Basic bare-metal Pulsar setup](/docs/assets/pulsar-basic-setup.png)

In this diagram, connecting clients need to be able to communicate with the Pulsar cluster using a single URL, in this case `pulsar-cluster.acme.com`, that abstracts over all of the message-handling brokers. Pulsar message brokers run on machines alongside BookKeeper bookies; brokers and bookies, in turn, rely on ZooKeeper.

### Hardware considerations

When deploying a Pulsar cluster, we have some basic recommendations that you should keep in mind when planning capacity.

For machines running ZooKeeper, we recommend using lighter-weight machines or VMs. Pulsar uses ZooKeeper only for periodic coordination- and configuration-related tasks, *not* for basic operations. If you're running Pulsar on [Amazon Web Services](https://aws.amazon.com/) (AWS), for example, a [t2.small](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/t2-instances.html) instance would likely suffice.

For machines running a bookie and a Pulsar broker, we recommend using more powerful machines. For an AWS deployment, for example, [i3.4xlarge](https://aws.amazon.com/blogs/aws/now-available-i3-instances-for-demanding-io-intensive-applications/) instances may be appropriate. On those machines we also recommend:

* Fast CPUs and 10Gbps [NIC](https://en.wikipedia.org/wiki/Network_interface_controller) (for Pulsar brokers)
* Small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache (for BookKeeper bookies)

## Installing the Pulsar binary package

> You'll need to install the Pulsar binary package on *each machine in the cluster*, including machines running [ZooKeeper](#deploying-a-zookeeper-cluster) and [BookKeeper](#deploying-a-bookkeeper-cluster).
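
Since every machine needs the same package, it can help to script the copy step once you've downloaded the tarball as described next. Here's a minimal sketch using `scp`; the hostnames are assumptions modeled on the examples in this guide, so substitute your own:

```shell
# Copy the release tarball to every host in the cluster
# (hostnames are placeholders for your actual machines)
$ for host in zk1 zk2 zk3 broker1 broker2 broker3; do
    scp apache-pulsar-*-bin.tar.gz "$host.us-west.example.com:"
  done
```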

To get started deploying a Pulsar cluster on bare metal, you'll need to download a binary tarball release in one of the following ways:

* By clicking on the link directly below, which will automatically trigger a download:
  * Pulsar pulsar:version binary release
* From the Pulsar [downloads page](pulsar:download_page_url)
* From the Pulsar [releases page](https://github.com/apache/incubator-pulsar/releases/latest) on [GitHub](https://github.com)
* Using [wget](https://www.gnu.org/software/wget):

```bash
$ wget http://archive.apache.org/dist/incubator/pulsar/pulsar-pulsar:version/apache-pulsar-pulsar:version-bin.tar.gz
```

Once you've downloaded the tarball, untar it and `cd` into the resulting directory:

```bash
$ tar xvzf apache-pulsar-pulsar:version-bin.tar.gz
$ cd apache-pulsar-pulsar:version
```

The untarred directory contains the following subdirectories:

Directory | Contains
:---------|:--------
`bin` | Pulsar's [command-line tools](reference-cli-tools.md), such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](reference-pulsar-admin.md)
`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more
`data` | The data storage directory used by ZooKeeper and BookKeeper
`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar
`logs` | Logs created by the installation

## Deploying a ZooKeeper cluster

[ZooKeeper](https://zookeeper.apache.org) manages a variety of essential coordination- and configuration-related tasks for Pulsar. To deploy a Pulsar cluster, you'll need to deploy ZooKeeper first (before all other components). We recommend deploying a 3-node ZooKeeper cluster. Pulsar does not make heavy use of ZooKeeper, so lightweight machines or VMs should suffice for running it.

To begin, add all ZooKeeper servers to the configuration specified in [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) (in the Pulsar directory you created [above](#installing-the-pulsar-binary-package)). Here's an example:

```properties
server.1=zk1.us-west.example.com:2888:3888
server.2=zk2.us-west.example.com:2888:3888
server.3=zk3.us-west.example.com:2888:3888
```

On each host, you need to specify the ID of the node in each node's `myid` file, which is in each server's `data/zookeeper` folder by default (this can be changed via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter).

> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed info on `myid` and more.

On a ZooKeeper server at `zk1.us-west.example.com`, for example, you could set the `myid` value like this:

```bash
$ mkdir -p data/zookeeper
$ echo 1 > data/zookeeper/myid
```

On `zk2.us-west.example.com` the command would be `echo 2 > data/zookeeper/myid` and so on.

Once each server has been added to the `zookeeper.conf` configuration and has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool:

```bash
$ bin/pulsar-daemon start zookeeper
```

## Initializing cluster metadata

Once you've deployed ZooKeeper for your cluster, there is some metadata that needs to be written to ZooKeeper for each cluster in your instance.
It only needs to be written **once**.

You can initialize this metadata using the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool. This command can be run on any machine in your ZooKeeper cluster. Here's an example:

```shell
$ bin/pulsar initialize-cluster-metadata \
  --cluster pulsar-cluster-1 \
  --zookeeper zk1.us-west.example.com:2181 \
  --configuration-store zk1.us-west.example.com:2181 \
  --web-service-url http://pulsar.us-west.example.com:8080 \
  --web-service-url-tls https://pulsar.us-west.example.com:8443 \
  --broker-service-url pulsar://pulsar.us-west.example.com:6650 \
  --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651
```

As you can see from the example above, the following needs to be specified:

Flag | Description
:----|:-----------
`--cluster` | A name for the cluster
`--zookeeper` | A "local" ZooKeeper connection string for the cluster. This connection string only needs to include *one* machine in the ZooKeeper cluster.
`--configuration-store` | The configuration store connection string for the entire instance. As with the `--zookeeper` flag, this connection string only needs to include *one* machine in the ZooKeeper cluster.
`--web-service-url` | The web service URL for the cluster, plus a port. This URL should be a standard DNS name. The default port is 8080 (we don't recommend using a different port).
`--web-service-url-tls` | If you're using [TLS](administration-auth.md#tls-client-auth), you'll also need to specify a TLS web service URL for the cluster. The default port is 8443 (we don't recommend using a different port).
`--broker-service-url` | A broker service URL enabling interaction with the brokers in the cluster. This URL should use the same DNS name as the web service URL but should use the `pulsar` scheme instead. The default port is 6650 (we don't recommend using a different port).
`--broker-service-url-tls` | If you're using [TLS](administration-auth.md#tls-client-auth), you'll also need to specify a TLS broker service URL for the brokers in the cluster. This URL should use the `pulsar+ssl` scheme. The default port is 6651 (we don't recommend using a different port).

## Deploying a BookKeeper cluster

[BookKeeper](https://bookkeeper.apache.org) handles all persistent data storage in Pulsar. You will need to deploy a cluster of BookKeeper bookies to use Pulsar. We recommend running a **3-bookie BookKeeper cluster**.

BookKeeper bookies can be configured using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. The most important step in configuring bookies for our purposes here is ensuring that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) parameter is set to the connection string for the ZooKeeper cluster. Here's an example:

```properties
zkServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181
```

Once you've appropriately modified the `zkServers` parameter, you can make any other configuration changes you need. You can find a full listing of the available BookKeeper configuration parameters [here](reference-configuration.md#bookkeeper), although we would recommend consulting the [BookKeeper documentation](http://bookkeeper.apache.org/docs/latest/reference/config/) for a more in-depth guide.
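
For example, if your bookie hosts have dedicated disks mounted for the journal and for ledger storage, you might point BookKeeper at them. This is only a sketch: the mount paths are assumptions, while `journalDirectory` and `ledgerDirectories` are standard BookKeeper parameters (see the reference linked above):

```shell
# Append example storage settings to the bookie configuration
# (the /mnt/... paths are placeholders for your actual mounts)
$ cat >> conf/bookkeeper.conf <<'EOF'
journalDirectory=/mnt/journal/bookkeeper/journal
ledgerDirectories=/mnt/storage/bookkeeper/ledgers
EOF
```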

Once you've applied the desired configuration in `conf/bookkeeper.conf`, you can start up a bookie on each of your BookKeeper hosts. You can start up each bookie either in the background, using [nohup](https://en.wikipedia.org/wiki/Nohup), or in the foreground.

To start the bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool:

```bash
$ bin/pulsar-daemon start bookie
```

To start the bookie in the foreground:

```bash
$ bin/bookkeeper bookie
```

You can verify that the bookie is working properly using the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#bookkeeper-shell):

```bash
$ bin/bookkeeper shell bookiesanity
```

This will create an ephemeral BookKeeper ledger on the local bookie, write a few entries, read them back, and finally delete the ledger.

## Deploying Pulsar brokers

Pulsar brokers are the last thing you need to deploy in your Pulsar cluster. Brokers handle Pulsar messages and provide Pulsar's administrative interface. We recommend running **3 brokers**, one for each machine that's already running a BookKeeper bookie.

The most important element of broker configuration is ensuring that each broker is aware of the ZooKeeper cluster that you've deployed. Make sure that the [`zookeeperServers`](reference-configuration.md#broker-zookeeperServers) and [`configurationStoreServers`](reference-configuration.md#broker-configurationStoreServers) parameters are set correctly. In this case, since we only have one cluster and no separate configuration store, the `configurationStoreServers` parameter should point to the same ZooKeeper servers as `zookeeperServers`:

```properties
zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181
configurationStoreServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181
```

You also need to specify the cluster name (matching the name that you provided when [initializing the cluster's metadata](#initializing-cluster-metadata)):

```properties
clusterName=pulsar-cluster-1
```

You can then provide any other configuration changes that you'd like in the [`conf/broker.conf`](reference-configuration.md#broker) file. Once you've decided on a configuration, you can start up the brokers for your Pulsar cluster. Like ZooKeeper and BookKeeper, brokers can be started either in the foreground or in the background, using nohup.

You can start a broker in the foreground using the [`pulsar broker`](reference-cli-tools.md#pulsar-broker) command:

```bash
$ bin/pulsar broker
```

You can start a broker in the background using the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool:

```bash
$ bin/pulsar-daemon start broker
```

Once you've successfully started up all the brokers you intend to use, your Pulsar cluster should be ready to go!

## Connecting to the running cluster

Once your Pulsar cluster is up and running, you should be able to connect with it using Pulsar clients. One such client is the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool, which is included with the Pulsar binary package. The `pulsar-client` tool can publish messages to and consume messages from Pulsar topics and thus provides a simple way to make sure that your cluster is running properly.

To use the `pulsar-client` tool, first modify the client configuration file in [`conf/client.conf`](reference-configuration.md#client) in your binary package.
You'll need to change the values for `webServiceUrl` and `brokerServiceUrl`, substituting `localhost` (which is the default) with the DNS name that you've assigned to your broker/bookie hosts. Here's an example:

```properties
webServiceUrl=http://us-west.example.com:8080/
brokerServiceUrl=pulsar://us-west.example.com:6650/
```

Once you've done that, you can publish a message to a Pulsar topic:

```bash
$ bin/pulsar-client produce \
  persistent://public/default/test \
  -n 1 \
  -m "Hello, Pulsar"
```

> You may need to use a different cluster name in the topic if you specified a cluster name different from `pulsar-cluster-1`.

This will publish a single message to the Pulsar topic.
diff --git a/site2/docs/deploy-dcos.md b/site2/docs/deploy-dcos.md
new file mode 100644
index 0000000000000000000000000000000000000000..62dfd8150d1b26a5c852b396de90b8bfe0a3ddf4
--- /dev/null
+++ b/site2/docs/deploy-dcos.md
@@ -0,0 +1,177 @@
---
id: deploy-dcos
title: Deploying Pulsar on DC/OS
sidebar_label: DC/OS
---

[DC/OS](https://dcos.io/) (the DataCenter Operating System) is a distributed operating system used for deploying and managing applications and systems on [Apache Mesos](http://mesos.apache.org/). DC/OS is an open-source tool created and maintained by [Mesosphere](https://mesosphere.com/).

Apache Pulsar is available as a [Marathon Application Group](https://mesosphere.github.io/marathon/docs/application-groups.html), which runs multiple applications as manageable sets.

## Prerequisites

In order to run Pulsar on DC/OS, you will need the following:

* DC/OS version [1.9](https://docs.mesosphere.com/1.9/) or higher
* A [DC/OS cluster](https://docs.mesosphere.com/1.9/installing/) with at least three agent nodes
* The [DC/OS CLI tool](https://docs.mesosphere.com/1.9/cli/install/) installed
* The [`PulsarGroups.json`](https://github.com/apache/incubator-pulsar/blob/master/deployment/dcos/PulsarGroups.json) configuration file from the Pulsar GitHub repo.

  ```bash
  $ curl -O https://raw.githubusercontent.com/apache/incubator-pulsar/master/deployment/dcos/PulsarGroups.json
  ```

Each node in the DC/OS-managed Mesos cluster must have at least:

* 4 CPUs
* 4 GB of memory
* 60 GB of total persistent disk

Alternatively, you can change the configuration in `PulsarGroups.json` to match your DC/OS cluster's resources.

## Deploy Pulsar using the DC/OS command-line interface

You can deploy Pulsar on DC/OS using this command:

```bash
$ dcos marathon group add PulsarGroups.json
```

This command will deploy Docker container instances in three groups, which together comprise a Pulsar cluster:

* 3 bookies (1 bookie on each agent node and 1 [bookie recovery](http://bookkeeper.apache.org/docs/latest/admin/autorecovery/) instance)
* 3 Pulsar brokers (1 broker on each node and 1 admin instance)
* 1 [Prometheus](http://prometheus.io/) instance and 1 [Grafana](https://grafana.com/) instance


> When running DC/OS, a ZooKeeper cluster is already running at `master.mesos:2181`, so there's no need to install or start up ZooKeeper separately.

After executing the `dcos` command above, click on the **Services** tab in the DC/OS [GUI](https://docs.mesosphere.com/latest/gui/), which you can access at [http://m1.dcos](http://m1.dcos) in this example. You should see several applications in the process of deploying, as in the screenshots below.
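
You can also watch the deployment from the command line. This is a sketch using standard DC/OS CLI subcommands; the exact output varies by version:

```shell
# In-flight Marathon deployments; the Pulsar groups drop off this
# list once they have finished deploying
$ dcos marathon deployment list

# List the applications that make up the pulsar group
$ dcos marathon app list
```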

![DC/OS command executed](/docs/assets/dcos_command_execute.png)

![DC/OS command executed2](/docs/assets/dcos_command_execute2.png)

## The BookKeeper group

To monitor the status of the BookKeeper cluster deployment, click on the **bookkeeper** group in the parent **pulsar** group.

![DC/OS bookkeeper status](/docs/assets/dcos_bookkeeper_status.png)

At this point, 3 bookies should be shown as green, which means that they have been deployed successfully and are now running.

![DC/OS bookkeeper running](/docs/assets/dcos_bookkeeper_run.png)

You can also click into each bookie instance to get more detailed info, such as the bookie running log.

![DC/OS bookie log](/docs/assets/dcos_bookie_log.png)

To display information about BookKeeper in ZooKeeper, you can visit [http://m1.dcos/exhibitor](http://m1.dcos/exhibitor). In this example, there are 3 bookies under the `available` directory.

![DC/OS bookkeeper in zk](/docs/assets/dcos_bookkeeper_in_zookeeper.png)

## The Pulsar broker group

Similar to the BookKeeper group above, click into the **brokers** group to check the status of the Pulsar brokers.

![DC/OS broker status](/docs/assets/dcos_broker_status.png)

![DC/OS broker running](/docs/assets/dcos_broker_run.png)

You can also click into each broker instance to get more detailed info, such as the broker running log.

![DC/OS broker log](/docs/assets/dcos_broker_log.png)

Broker cluster information in ZooKeeper is also available through the web UI. In this example, you can see that the `loadbalance` and `managed-ledgers` directories have been created.

![DC/OS broker in zk](/docs/assets/dcos_broker_in_zookeeper.png)

## The monitor group

The **monitor** group consists of Prometheus and Grafana.

![DC/OS monitor status](/docs/assets/dcos_monitor_status.png)

### Prometheus

Click into the `prom` instance to get the endpoint of Prometheus, which is `192.168.65.121:9090` in this example.

![DC/OS prom endpoint](/docs/assets/dcos_prom_endpoint.png)

If you click that endpoint, you'll see the Prometheus dashboard. The [http://192.168.65.121:9090/targets](http://192.168.65.121:9090/targets) URL will display all the bookies and brokers.

![DC/OS prom targets](/docs/assets/dcos_prom_targets.png)

### Grafana

Click into `grafana` to get the endpoint for Grafana, which is `192.168.65.121:3000` in this example.

![DC/OS grafana endpoint](/docs/assets/dcos_grafana_endpoint.png)

If you click that endpoint, you can access the Grafana dashboard.

![DC/OS grafana targets](/docs/assets/dcos_grafana_dashboard.png)

## Run a simple Pulsar consumer and producer on DC/OS

Now that we have a fully deployed Pulsar cluster, we can run a simple consumer and producer to show Pulsar on DC/OS in action.

### Download and prepare the Pulsar Java tutorial

There's a [Pulsar Java tutorial](https://github.com/streamlio/pulsar-java-tutorial) repo that you can clone. This repo contains a simple Pulsar consumer and producer (more info can be found in the repo's `README` file).

```bash
$ git clone https://github.com/streamlio/pulsar-java-tutorial
```

Change the `SERVICE_URL` from `pulsar://localhost:6650` to `pulsar://a1.dcos:6650` in both [`ConsumerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ConsumerTutorial.java) and [`ProducerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ProducerTutorial.java), for example with the one-liner sketched below.
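
A quick way to make that substitution from the repo root (a sketch assuming GNU `sed`; the file paths follow the tutorial repo's layout):

```shell
$ cd pulsar-java-tutorial
$ sed -i 's|pulsar://localhost:6650|pulsar://a1.dcos:6650|g' \
    src/main/java/tutorial/ConsumerTutorial.java \
    src/main/java/tutorial/ProducerTutorial.java
```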

The `pulsar://a1.dcos:6650` endpoint is for the broker service. Endpoint details for each broker instance can be fetched from the DC/OS GUI. `a1.dcos` is a DC/OS client agent that runs a broker, and it can also be replaced by the client agent's IP address.

Now, change the message number from 10 to 10000000 in the `main` method of [`ProducerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ProducerTutorial.java) so that it produces more messages.

Then compile the project code:

```bash
$ mvn clean package
```

### Run the consumer and producer

Execute this command to run the consumer:

```bash
$ mvn exec:java -Dexec.mainClass="tutorial.ConsumerTutorial"
```

Execute this command to run the producer:

```bash
$ mvn exec:java -Dexec.mainClass="tutorial.ProducerTutorial"
```

You can see the producer producing messages and the consumer consuming messages through the DC/OS GUI.

![DC/OS pulsar producer](/docs/assets/dcos_producer.png)

![DC/OS pulsar consumer](/docs/assets/dcos_consumer.png)

### View Grafana metric output

While the producer and consumer are running, you can view the running metrics in Grafana.

![DC/OS pulsar dashboard](/docs/assets/dcos_metrics.png)


## Uninstall Pulsar

You can shut down and uninstall the `pulsar` application from DC/OS at any time in one of two ways:

1. Using the DC/OS GUI, you can choose **Delete** at the right end of the Pulsar group.

   ![DC/OS pulsar uninstall](/docs/assets/dcos_uninstall.png)

2. You can use the following command:

   ```bash
   $ dcos marathon group remove /pulsar
   ```
diff --git a/site2/docs/deploy-kubernetes.md b/site2/docs/deploy-kubernetes.md
new file mode 100644
index 0000000000000000000000000000000000000000..75f2d920c9db0c653bf0c679b682c93350a33528
--- /dev/null
+++ b/site2/docs/deploy-kubernetes.md
@@ -0,0 +1,317 @@
---
id: deploy-kubernetes
title: Deploying Pulsar on Kubernetes
sidebar_label: Kubernetes
---

Pulsar can be easily deployed in [Kubernetes](https://kubernetes.io/) clusters, either in managed clusters on [Google Kubernetes Engine](#pulsar-on-google-kubernetes-engine) or [Amazon Web Services](https://aws.amazon.com/) or in [custom clusters](#pulsar-on-a-custom-kubernetes-cluster).

The deployment method shown in this guide relies on [YAML](http://yaml.org/) definitions for Kubernetes [resources](https://kubernetes.io/docs/reference/). The [`deployment/kubernetes`](pulsar:repo_url/deployment/kubernetes) subdirectory of the [Pulsar package](pulsar:download_page_url) holds resource definitions for:

* A two-bookie BookKeeper cluster
* A three-node ZooKeeper cluster
* A three-broker Pulsar cluster
* A [monitoring stack](#monitoring) consisting of [Prometheus](https://prometheus.io/), [Grafana](https://grafana.com), and the [Pulsar dashboard](administration-dashboard.md)
* A [pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/) from which you can run administrative commands using the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool

## Setup

To get started, download a source package from the [downloads page](pulsar:download_page_url).

> Please note that the Pulsar binary package will *not* contain the necessary YAML resources to deploy Pulsar on Kubernetes; you need the source package.
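
For example, you might fetch and unpack the source release like this. This is a sketch: the `-src.tar.gz` file name and mirror path mirror the binary-release examples used elsewhere in these docs, so double-check them against the downloads page:

```shell
$ wget http://archive.apache.org/dist/incubator/pulsar/pulsar-pulsar:version/apache-pulsar-pulsar:version-src.tar.gz
$ tar xvfz apache-pulsar-pulsar:version-src.tar.gz
# The extracted directory name may differ slightly depending on the release
$ cd apache-pulsar-pulsar:version
```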

If you'd like to change the number of bookies, brokers, or ZooKeeper nodes in your Pulsar cluster, modify the `replicas` parameter in the `spec` section of the appropriate [`Deployment`](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) or [`StatefulSet`](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) resource.

## Pulsar on Google Kubernetes Engine

[Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine) (GKE) automates the creation and management of Kubernetes clusters in [Google Compute Engine](https://cloud.google.com/compute/) (GCE).

### Prerequisites

To get started, you'll need:

* A Google Cloud Platform account, which you can sign up for at [cloud.google.com](https://cloud.google.com)
* An existing Cloud Platform project
* The [Google Cloud SDK](https://cloud.google.com/sdk/downloads) (in particular the [`gcloud`](https://cloud.google.com/sdk/gcloud/) and `kubectl` tools).

### Create a new Kubernetes cluster

You can create a new GKE cluster using the [`container clusters create`](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create) command for `gcloud`. This command enables you to specify the number of nodes in the cluster, the machine types of those nodes, and more.

As an example, we'll create a new GKE cluster for Kubernetes version [1.6.4](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG.md#v164) in the [us-central1-a](https://cloud.google.com/compute/docs/regions-zones/regions-zones#available) zone. The cluster will be named `pulsar-gke-cluster` and will consist of three VMs, each using two locally attached SSDs and running on [n1-standard-8](https://cloud.google.com/compute/docs/machine-types) machines. These SSDs will be used by bookie instances, one for the BookKeeper [journal](getting-started-concepts-and-architecture.md#journal-storage) and the other for storing the actual message data.

```bash
$ gcloud container clusters create pulsar-gke-cluster \
  --zone=us-central1-a \
  --machine-type=n1-standard-8 \
  --num-nodes=3 \
  --local-ssd-count=2
```

By default, bookies will run on all the machines that have locally attached SSD disks. In this example, all of those machines will have two SSDs, but you can add different types of machines to the cluster later. You can control which machines host bookie servers using [labels](https://kubernetes.io/docs/concepts/overview/working-with-objects/labels).

### Dashboard

You can observe your cluster in the [Kubernetes Dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) by downloading the credentials for your Kubernetes cluster and opening up a proxy to the cluster:

```bash
$ gcloud container clusters get-credentials pulsar-gke-cluster \
  --zone=us-central1-a \
  --project=your-project-name
$ kubectl proxy
```

By default, the proxy will be opened on port 8001. Now you can navigate to [localhost:8001/ui](http://localhost:8001/ui) in your browser to access the dashboard. At first your GKE cluster will be empty, but that will change as you begin deploying Pulsar [components](#deploying-pulsar-components).

## Pulsar on Amazon Web Services

You can run Kubernetes on [Amazon Web Services](https://aws.amazon.com/) (AWS) in a variety of ways.
A very simple way that was [recently introduced](https://aws.amazon.com/blogs/compute/kubernetes-clusters-aws-kops/) involves using the [Kubernetes Operations](https://github.com/kubernetes/kops) (kops) tool.

You can find detailed instructions for setting up a Kubernetes cluster on AWS [here](https://github.com/kubernetes/kops/blob/master/docs/aws.md).

When you create a cluster using those instructions, your `kubectl` config in `~/.kube/config` (on macOS and Linux) will be updated for you, so you probably won't need to change your configuration. Nonetheless, you can ensure that `kubectl` can interact with your cluster by listing the nodes in the cluster:

```bash
$ kubectl get nodes
```

If `kubectl` is working with your cluster, you can proceed to [deploy Pulsar components](#deploying-pulsar-components).

## Pulsar on a custom Kubernetes cluster

Pulsar can be deployed on a custom, non-GKE Kubernetes cluster as well. You can find detailed documentation on how to choose a Kubernetes installation method that suits your needs in the [Picking the Right Solution](https://kubernetes.io/docs/setup/pick-right-solution) guide in the Kubernetes docs.

### Local cluster

The easiest way to run a Kubernetes cluster is to do so locally. To install a mini local cluster for testing purposes, running in local VMs, you can either:

1. Use [minikube](https://kubernetes.io/docs/getting-started-guides/minikube/) to run a single-node Kubernetes cluster
2. Create a local cluster running on multiple VMs on the same machine

For the second option, follow the [instructions](https://github.com/pires/kubernetes-vagrant-coreos-cluster) for running Kubernetes using [CoreOS](https://coreos.com/) on [Vagrant](https://www.vagrantup.com/). We'll provide an abridged version of those instructions here.


First, make sure you have [Vagrant](https://www.vagrantup.com/downloads.html) and [VirtualBox](https://www.virtualbox.org/wiki/Downloads) installed. Then clone the repo and start up the cluster:

```bash
$ git clone https://github.com/pires/kubernetes-vagrant-coreos-cluster
$ cd kubernetes-vagrant-coreos-cluster

# Start a three-VM cluster
$ NODES=3 USE_KUBE_UI=true vagrant up
```

Create SSD disk mount points on the VMs using this script:

```bash
$ for vm in node-01 node-02 node-03; do
    NODES=3 vagrant ssh $vm -c "sudo mkdir -p /mnt/disks/ssd0"
    NODES=3 vagrant ssh $vm -c "sudo mkdir -p /mnt/disks/ssd1"
  done
```

Bookies expect two logical device mounts to be available, one for the [journal](getting-started-concepts-and-architecture.md#journal-storage) and one for persistent message storage. In this VM exercise, we create two directories on each VM to stand in for those devices.

Once the cluster is up, you can verify that `kubectl` can access it:

```bash
$ kubectl get nodes
NAME           STATUS                     AGE       VERSION
172.17.8.101   Ready,SchedulingDisabled   10m       v1.6.4
172.17.8.102   Ready                      8m        v1.6.4
172.17.8.103   Ready                      6m        v1.6.4
172.17.8.104   Ready                      4m        v1.6.4
```

### Dashboard

In order to use the [Kubernetes Dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) with your local Kubernetes cluster, first use `kubectl` to create a proxy to the cluster:

```bash
$ kubectl proxy
```

Now you can access the web interface at [localhost:8001/ui](http://localhost:8001/ui). At first your local cluster will be empty, but that will change as you begin deploying Pulsar [components](#deploying-pulsar-components).

## Deploying Pulsar components

Now that you've set up a Kubernetes cluster, either on [Google Kubernetes Engine](#pulsar-on-google-kubernetes-engine) or on a [custom cluster](#pulsar-on-a-custom-kubernetes-cluster), you can begin deploying the components that make up Pulsar. The YAML resource definitions for Pulsar components can be found in the `deployment/kubernetes` folder of the [Pulsar source package](pulsar:download_page_url).

In that package, there are two sets of resource definitions, one for Google Kubernetes Engine (GKE) in the `deployment/kubernetes/google-kubernetes-engine` folder and one for a custom Kubernetes cluster in the `deployment/kubernetes/generic` folder. To begin, `cd` into the appropriate folder.

### ZooKeeper

You *must* deploy ZooKeeper as the first Pulsar component, as it is a dependency for the others.

```bash
$ kubectl apply -f zookeeper.yaml
```

Wait until all three ZooKeeper server pods are up and have the status `Running`. You can check on the status of the ZooKeeper pods at any time:

```bash
$ kubectl get pods -l component=zookeeper
NAME      READY     STATUS    RESTARTS   AGE
zk-0      1/1       Running   0          18m
zk-1      1/1       Running   0          17m
zk-2      0/1       Running   6          15m
```

This step may take several minutes, as Kubernetes needs to download the Docker image on the VMs.

#### Initialize cluster metadata

Once ZooKeeper is running, you need to initialize the metadata for the Pulsar cluster in ZooKeeper. This includes system metadata for BookKeeper and Pulsar more broadly. There is a Kubernetes [job](https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/) in the `cluster-metadata.yaml` file that you only need to run once:

```bash
$ kubectl apply -f cluster-metadata.yaml
```

For the sake of reference, that job runs the following command on an ephemeral pod:

```bash
$ bin/pulsar initialize-cluster-metadata \
  --cluster us-central \
  --zookeeper zookeeper \
  --global-zookeeper zookeeper \
  --web-service-url http://broker.default.svc.cluster.local:8080/ \
  --broker-service-url pulsar://broker.default.svc.cluster.local:6650/
```

#### Deploy the rest of the components

Once cluster metadata has been successfully initialized, you can then deploy the bookies, brokers, monitoring stack ([Prometheus](https://prometheus.io), [Grafana](https://grafana.com), and the [Pulsar dashboard](administration-dashboard.md)), and Pulsar cluster proxy:

```bash
$ kubectl apply -f bookie.yaml
$ kubectl apply -f broker.yaml
$ kubectl apply -f monitoring.yaml
$ kubectl apply -f proxy.yaml
```

You can check on the status of the pods for these components either in the Kubernetes Dashboard or using `kubectl`:

```bash
$ kubectl get pods -w -l app=pulsar
```

#### Set up tenants and namespaces

Once all of the components are up and running, you'll need to create at least one Pulsar tenant and at least one namespace.

> This step is not strictly required if Pulsar [authentication and authorization](administration-auth.md) is turned on, though it allows you to change [policies](admin-api-namespaces.md) for each of the namespaces later.

You can create tenants and namespaces (and perform any other administrative tasks) using the `pulsar-admin` pod that is already configured to act as an admin client for your newly created Pulsar cluster.
One easy way to perform administrative tasks is to create an alias for the [`pulsar-admin`](reference-pulsar-admin.md) tool installed on the admin pod.

```bash
$ alias pulsar-admin='kubectl exec pulsar-admin -it -- bin/pulsar-admin'
```

Now, any time you run `pulsar-admin`, you will be running commands from that pod. This command will create a tenant called `ten`:

```bash
$ pulsar-admin tenants create ten \
  --admin-roles admin \
  --allowed-clusters us-central
```

This command will create a `ns` namespace under the `ten` tenant:

```bash
$ pulsar-admin namespaces create ten/ns
```

To verify that everything has gone as planned:

```bash
$ pulsar-admin tenants list
ten

$ pulsar-admin namespaces list ten
ns
```

Now that you have a tenant and namespace set up, you can move on to [experimenting with your Pulsar cluster](#experimenting-with-your-cluster) from within the cluster or [connecting to the cluster](#client-connections) using a Pulsar client.

#### Experimenting with your cluster

Now that a tenant and namespace have been created, you can begin experimenting with your running Pulsar cluster. Using the same `pulsar-admin` pod via an alias, as in the section above, you can use [`pulsar-perf`](reference-cli-tools.md#pulsar-perf) to create a test producer to publish 10,000 messages a second on a topic in the tenant and namespace you created.

First, create an alias to use the `pulsar-perf` tool via the admin pod:

```bash
$ alias pulsar-perf='kubectl exec pulsar-admin -it -- bin/pulsar-perf'
```

Now, produce messages:

```bash
$ pulsar-perf produce persistent://ten/ns/my-topic \
  --rate 10000
```

Similarly, you can start a consumer to subscribe to and receive all the messages on that topic:

```bash
$ pulsar-perf consume persistent://ten/ns/my-topic \
  --subscriber-name my-subscription-name
```

You can also view [stats](administration-stats.md) for the topic using the [`pulsar-admin`](reference-pulsar-admin.md#persistent-stats) tool:

```bash
$ pulsar-admin persistent stats persistent://ten/ns/my-topic
```

### Monitoring

The default monitoring stack for Pulsar on Kubernetes consists of [Prometheus](#prometheus), [Grafana](#grafana), and the [Pulsar dashboard](administration-dashboard.md).

#### Prometheus

All Pulsar metrics in Kubernetes are collected by a [Prometheus](https://prometheus.io) instance running inside the cluster. Typically, there is no need to access Prometheus directly. Instead, you can use the [Grafana interface](#grafana) that displays the data stored in Prometheus.

#### Grafana

In your Kubernetes cluster, you can use [Grafana](https://grafana.com) to view dashboards for Pulsar namespaces (message rates, latency, and storage), JVM stats, ZooKeeper, and BookKeeper. You can access the pod serving Grafana using `kubectl`'s [`port-forward`](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster) command:

```bash
$ kubectl port-forward \
  $(kubectl get pods -l component=grafana -o jsonpath='{.items[*].metadata.name}') 3000
```

You can then access the dashboard in your web browser at [localhost:3000](http://localhost:3000).

#### Pulsar dashboard

While Grafana and Prometheus are used to provide graphs with historical data, [Pulsar dashboard](administration-dashboard.md) reports more detailed current data for individual topics.

For example, you can have sortable tables showing all namespaces, topics, and broker stats, with details on the IP address for consumers, how long they've been connected, and much more.

You can access the pod serving the Pulsar dashboard using `kubectl`'s [`port-forward`](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster) command:

```bash
$ kubectl port-forward \
  $(kubectl get pods -l component=dashboard -o jsonpath='{.items[*].metadata.name}') 8080:80
```

You can then access the dashboard in your web browser at [localhost:8080](http://localhost:8080).

### Client connections

Once your Pulsar cluster is running on Kubernetes, you can connect to it using a Pulsar client. You can fetch the IP address for the Pulsar proxy running in your Kubernetes cluster using `kubectl`:

```bash
$ kubectl get service broker-proxy \
  --output=jsonpath='{.status.loadBalancer.ingress[*].ip}'
```

If the IP address for the proxy were, for example, 35.12.13.198, you could connect to Pulsar using `pulsar://35.12.13.198:6650`.

You can find client documentation for:

* [Java](client-libraries-java.md)
* [Python](client-libraries-python.md)
* [C++](client-libraries-cpp.md)


diff --git a/site2/docs/deploy-monitoring.md b/site2/docs/deploy-monitoring.md
new file mode 100644
index 0000000000000000000000000000000000000000..989db8c6aaf5d3ebcf48be7765e95c8bc1b2035a
--- /dev/null
+++ b/site2/docs/deploy-monitoring.md
@@ -0,0 +1,97 @@
---
id: deploy-monitoring
title: Monitoring
sidebar_label: Monitoring
---

There are different ways to monitor a Pulsar cluster, exposing both metrics related to the usage of topics and the overall health of the individual components of the cluster.

## Collecting metrics

### Broker stats

Pulsar broker metrics can be collected from brokers and exported in JSON format. There are two main types of metrics:

* *Destination dumps*, which contain stats for each individual topic. They can be fetched using:

  ```shell
  bin/pulsar-admin broker-stats destinations
  ```

* Broker metrics, containing broker info and topic stats aggregated at the namespace
  level:

  ```shell
  bin/pulsar-admin broker-stats monitoring-metrics
  ```

All the message rates are updated every minute.

The aggregated broker metrics are also exposed in the [Prometheus](https://prometheus.io) format at:

```shell
http://$BROKER_ADDRESS:8080/metrics
```

### ZooKeeper stats

The local and global ZooKeeper servers and clients that are shipped with Pulsar have been instrumented to expose
detailed stats through Prometheus as well:

```shell
http://$LOCAL_ZK_SERVER:8000/metrics
http://$GLOBAL_ZK_SERVER:8001/metrics
```

The default stats port of local ZooKeeper is `8000` and that of global ZooKeeper is `8001`.
These can be changed by specifying the `stats_server_port` system property.

### BookKeeper stats

For BookKeeper, you can configure the stats framework by changing the `statsProviderClass` parameter in
`conf/bookkeeper.conf`.

The BookKeeper configuration included with the Pulsar distribution enables the Prometheus exporter by default. The stats are exposed at:

```shell
http://$BOOKIE_ADDRESS:8000/metrics
```

For bookies, the default port is `8000` (instead of `8080`), and it can be configured by changing
the `prometheusStatsHttpPort` parameter in `conf/bookkeeper.conf`.

## Configuring Prometheus

You can configure Prometheus to collect and store the metrics data by following the Prometheus
[Getting started](https://prometheus.io/docs/introduction/getting_started/) guide.

When running on bare metal, you can provide the list of nodes that need to be probed. When deploying
in a Kubernetes cluster, the monitoring is automatically set up with the [provided](deploy-kubernetes.md)
instructions.

## Dashboards

When collecting time series statistics, the major challenge is to make sure the number of dimensions
attached to the data does not explode.

For that reason we only collect time series of metrics aggregated at the namespace level.

### Pulsar per-topic dashboard

The per-topic dashboard instructions are available at [Dashboard](administration-dashboard.md).

### Grafana

You can use Grafana to easily create dashboards driven by the data stored in Prometheus.

There is a `pulsar-grafana` Docker image that is ready to use with the principal dashboards already
in place. This is enabled by default when deploying Pulsar on Kubernetes.

To use the dashboard manually:

```shell
docker run -p3000:3000 \
  -e PROMETHEUS_URL=http://$PROMETHEUS_HOST:9090/ \
  apachepulsar/pulsar-grafana:latest
```
diff --git a/site2/docs/developing-binary-protocol.md b/site2/docs/developing-binary-protocol.md
new file mode 100644
index 0000000000000000000000000000000000000000..dfe3365c7d00213ef367c771316ffd6d5922dd20
--- /dev/null
+++ b/site2/docs/developing-binary-protocol.md
@@ -0,0 +1,557 @@
---
id: develop-binary-protocol
title: Pulsar binary protocol specification
sidebar_label: Binary protocol
---

Pulsar uses a custom binary protocol for communications between producers/consumers and brokers. This protocol is designed to support required features, such as acknowledgements and flow control, while ensuring maximum transport and implementation efficiency.

Clients and brokers exchange *commands* with each other. Commands are formatted as binary [protocol buffer](https://developers.google.com/protocol-buffers/) (aka *protobuf*) messages. The format of protobuf commands is specified in the [`PulsarApi.proto`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto) file and also documented in the [Protobuf interface](#protobuf-interface) section below.

> ### Connection sharing
> Commands for different producers and consumers can be interleaved and sent through the same connection without restriction.

All commands associated with Pulsar's protocol are contained in a
[`BaseCommand`](#pulsar.proto.BaseCommand) protobuf message that includes a [`Type`](#pulsar.proto.Type) [enum](https://developers.google.com/protocol-buffers/docs/proto#enum) with all possible subcommands as optional fields. `BaseCommand` messages can specify only one subcommand.

## Framing

Since protobuf doesn't provide any sort of message frame, all messages in the Pulsar protocol are prepended with a 4-byte field that specifies the size of the frame. The maximum allowable size of a single frame is 5 MB.

The Pulsar protocol allows for two types of commands:

1. **Simple commands** that do not carry a message payload.
**Payload commands** that bear a payload that is used when publishing or delivering messages. In payload commands, the protobuf command data is followed by protobuf [metadata](#message-metadata) and then the payload, which is passed in raw format outside of protobuf. All sizes are passed as 4-byte unsigned big endian integers.

> Message payloads are passed in raw format rather than protobuf format for efficiency reasons.

### Simple commands

Simple (payload-free) commands have this basic structure:

| Component   | Description                                                                              | Size (in bytes) |
|:------------|:-----------------------------------------------------------------------------------------|:----------------|
| totalSize   | The size of the frame, counting everything that comes after it (in bytes)                 | 4               |
| commandSize | The size of the protobuf-serialized command                                               | 4               |
| message     | The protobuf message serialized in a raw binary format (rather than in protobuf format)   |                 |

### Payload commands

Payload commands have this basic structure:

| Component    | Description                                                                                  | Size (in bytes) |
|:-------------|:-----------------------------------------------------------------------------------------------|:----------------|
| totalSize    | The size of the frame, counting everything that comes after it (in bytes)                    | 4               |
| commandSize  | The size of the protobuf-serialized command                                                  | 4               |
| message      | The protobuf message serialized in a raw binary format (rather than in protobuf format)      |                 |
| magicNumber  | A 2-byte byte array (`0x0e01`) identifying the current format                                | 2               |
| checksum     | A [CRC32-C checksum](http://www.evanjones.ca/crc32c.html) of everything that comes after it  | 4               |
| metadataSize | The size of the message [metadata](#message-metadata)                                        | 4               |
| metadata     | The message [metadata](#message-metadata) stored as a binary protobuf message                |                 |
| payload      | Anything left in the frame is considered the payload and can include any sequence of bytes   |                 |

## Message metadata

Message metadata is stored alongside the application-specified payload as a serialized protobuf message. Metadata is created by the producer and passed on unchanged to the consumer.

| Field | Description |
|:------|:------------|
| `producer_name` | The name of the producer that published the message |
| `sequence_id` | The sequence ID of the message, assigned by the producer |
| `publish_time` | The publish timestamp in Unix time (i.e. as the number of milliseconds since January 1st, 1970 in UTC) |
| `properties` | A sequence of key/value pairs (using the [`KeyValue`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto#L32) message). These are application-defined keys and values with no special meaning to Pulsar. |
| `replicated_from` *(optional)* | Indicates that the message has been replicated and specifies the name of the cluster where the message was originally published |
| `partition_key` *(optional)* | While publishing on a partitioned topic, if the key is present, the hash of the key is used to determine which partition to choose |
| `compression` *(optional)* | Signals that the payload has been compressed and with which compression library |
| `uncompressed_size` *(optional)* | If compression is used, the producer must fill the uncompressed size field with the original payload size |
| `num_messages_in_batch` *(optional)* | If this message is really a [batch](#batch-messages) of multiple entries, this field must be set to the number of messages in the batch |

### Batch messages

When using batch messages, the payload contains a list of entries, each with its own individual metadata, defined by the `SingleMessageMetadata` object.

For a single batch, the payload format looks like this:

| Field | Description |
|:------|:------------|
| metadataSizeN | The size of the serialized single message metadata |
| metadataN | Single message metadata |
| payloadN | Message payload passed by the application |

Each metadata field looks like this:

| Field | Description |
|:------|:------------|
| properties | Application-defined properties |
| partition key *(optional)* | Key to indicate the hashing to a particular partition |
| payload_size | Size of the payload for the single message in the batch |

When compression is enabled, the whole batch is compressed at once.

## Interactions

### Connection establishment

After opening a TCP connection to a broker, typically on port 6650, the client is responsible for initiating the session.

![Connect interaction](/docs/assets/binary-protocol-connect.png)

After receiving a `Connected` response from the broker, the client can consider the connection ready to use. Alternatively, if the broker fails to validate the client's authentication, it will reply with an `Error` command and close the TCP connection.

Example:

```protobuf
message CommandConnect {
  "client_version" : "Pulsar-Client-Java-v1.15.2",
  "auth_method_name" : "my-authentication-plugin",
  "auth_data" : "my-auth-data",
  "protocol_version" : 6
}
```

Fields:
 * `client_version` → String-based identifier. Format is not enforced
 * `auth_method_name` → *(optional)* Name of the authentication plugin if auth is enabled
 * `auth_data` → *(optional)* Plugin-specific authentication data
 * `protocol_version` → Indicates the protocol version supported by the client. The broker will not send commands introduced in newer revisions of the protocol. The broker might enforce a minimum version

```protobuf
message CommandConnected {
  "server_version" : "Pulsar-Broker-v1.15.2",
  "protocol_version" : 6
}
```

Fields:
 * `server_version` → String identifier of the broker version
 * `protocol_version` → Protocol version supported by the broker.
The client must not attempt to send commands introduced in newer revisions of the protocol

### Keep Alive

To identify prolonged network partitions between clients and brokers, or cases in which a machine crashes without interrupting the TCP connection on the remote end (e.g. power outage, kernel panic, hard reboot), we have introduced a mechanism to probe for the availability status of the remote peer.

Both clients and brokers send `Ping` commands periodically, and they will close the socket if a `Pong` response is not received within a timeout (the default used by the broker is 60 seconds).

A valid implementation of a Pulsar client is not required to send the `Ping` probe, though it is required to promptly reply after receiving one from the broker in order to prevent the remote side from forcibly closing the TCP connection.

### Producer

In order to send messages, a client needs to establish a producer. When creating a producer, the broker will first verify that this particular client is authorized to publish on the topic.

Once the client gets confirmation of the producer creation, it can publish messages to the broker, referring to the producer id negotiated before.

![Producer interaction](/docs/assets/binary-protocol-producer.png)

##### Command Producer

```protobuf
message CommandProducer {
  "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic",
  "producer_id" : 1,
  "request_id" : 1
}
```

Parameters:
 * `topic` → Complete name of the topic on which to create the producer
 * `producer_id` → Client-generated producer identifier. Needs to be unique within the same connection
 * `request_id` → Identifier for this request. Used to match the response with the originating request. Needs to be unique within the same connection
 * `producer_name` → *(optional)* If a producer name is specified, the name will be used; otherwise the broker will generate a unique name. The generated producer name is guaranteed to be globally unique. Implementations are expected to let the broker generate a new producer name when the producer is initially created, then reuse it when recreating the producer after reconnections.

The broker will reply with either `ProducerSuccess` or `Error` commands.

##### Command ProducerSuccess

```protobuf
message CommandProducerSuccess {
  "request_id" : 1,
  "producer_name" : "generated-unique-producer-name"
}
```

Parameters:
 * `request_id` → Original id of the `CreateProducer` request
 * `producer_name` → Generated globally unique producer name, or the name specified by the client, if any.

##### Command Send

Command `Send` is used to publish a new message within the context of an already existing producer. This command is used in a frame that includes the command as well as the message payload, for which the complete format is specified in the [payload commands](#payload-commands) section.

```protobuf
message CommandSend {
  "producer_id" : 1,
  "sequence_id" : 0,
  "num_messages" : 1
}
```

Parameters:
 * `producer_id` → id of an existing producer
 * `sequence_id` → each message has an associated sequence id, which is expected to be implemented with a counter starting at 0. The `SendReceipt` that acknowledges the effective publishing of a message will refer to it by its sequence id.
 * `num_messages` → *(optional)* Used when publishing a batch of messages at once.
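To make the frame layout from the [framing](#framing) section concrete, here is a minimal sketch, in Java, of how a simple (payload-free) command frame could be assembled. The `serializedCommand` argument stands in for a protobuf-serialized `BaseCommand`; the class and method names are hypothetical and not part of any Pulsar client API.

```java
import java.nio.ByteBuffer;

public class SimpleCommandFramer {
    // Frame layout for simple commands: [totalSize][commandSize][command bytes],
    // where both size fields are 4-byte big-endian unsigned integers.
    public static ByteBuffer frame(byte[] serializedCommand) {
        int commandSize = serializedCommand.length;
        int totalSize = 4 + commandSize; // counts everything after the totalSize field
        ByteBuffer buf = ByteBuffer.allocate(4 + totalSize); // ByteBuffer is big-endian by default
        buf.putInt(totalSize);
        buf.putInt(commandSize);
        buf.put(serializedCommand);
        buf.flip();
        return buf;
    }
}
```

A payload command frame would additionally append the magic number, checksum, metadata size, metadata, and raw payload after the serialized command, per the table above.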
+

##### Command SendReceipt

After a message has been persisted on the configured number of replicas, the broker will send the acknowledgment receipt to the producer.

```protobuf
message CommandSendReceipt {
  "producer_id" : 1,
  "sequence_id" : 0,
  "message_id" : {
    "ledgerId" : 123,
    "entryId" : 456
  }
}
```

Parameters:
 * `producer_id` → id of the producer originating the send request
 * `sequence_id` → sequence id of the published message
 * `message_id` → message id assigned by the system to the published message. Unique within a single cluster. The message id is composed of 2 longs, `ledgerId` and `entryId`, reflecting that this unique id is assigned when appending to a BookKeeper ledger

##### Command CloseProducer

**Note**: *This command can be sent by either producer or broker*.

When receiving a `CloseProducer` command, the broker will stop accepting any more messages for the producer, wait until all pending messages are persisted, and then reply `Success` to the client.

The broker can send a `CloseProducer` command to the client when it's performing a graceful failover (e.g. the broker is being restarted, or the topic is being unloaded by the load balancer to be transferred to a different broker).

When receiving the `CloseProducer`, the client is expected to go through the service discovery lookup again and recreate the producer. The TCP connection is not affected.

### Consumer

A consumer is used to attach to a subscription and consume messages from it. After every reconnection, a client needs to subscribe to the topic. If a subscription is not already there, a new one will be created.

![Consumer](/docs/assets/binary-protocol-consumer.png)

#### Flow control

After the consumer is ready, the client needs to *give permission* to the broker to push messages. This is done with the `Flow` command.

A `Flow` command gives additional *permits* to send messages to the consumer. A typical consumer implementation will use a queue to accumulate these messages before the application is ready to consume them.

After the application has dequeued a number of messages, the consumer sends additional permits to allow the broker to push more messages.

##### Command Subscribe

```protobuf
message CommandSubscribe {
  "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic",
  "subscription" : "my-subscription-name",
  "subType" : "Exclusive",
  "consumer_id" : 1,
  "request_id" : 1
}
```

Parameters:
 * `topic` → Complete name of the topic on which to create the consumer
 * `subscription` → Subscription name
 * `subType` → Subscription type: Exclusive, Shared, Failover
 * `consumer_id` → Client-generated consumer identifier. Needs to be unique within the same connection
 * `request_id` → Identifier for this request. Used to match the response with the originating request. Needs to be unique within the same connection
 * `consumer_name` → *(optional)* Clients can specify a consumer name. This name can be used to track a particular consumer in the stats. Also, in the Failover subscription type, the name is used to decide which consumer is elected as *master* (the one receiving messages): consumers are sorted by their consumer name and the first one is elected master (see the sketch below).
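As a hedged illustration of that election rule, the sketch below sorts consumer names and picks the first; the class and method names are hypothetical and not part of the protocol itself.

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class FailoverElection {
    // Elects the master consumer for a Failover subscription, per the rule
    // above: sort consumers by name and take the first one.
    public static String electMaster(List<String> consumerNames) {
        List<String> sorted = new ArrayList<>(consumerNames);
        Collections.sort(sorted);
        return sorted.get(0); // assumes at least one connected consumer
    }
}
```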
+

##### Command Flow

```protobuf
message CommandFlow {
  "consumer_id" : 1,
  "messagePermits" : 1000
}
```

Parameters:
* `consumer_id` → Id of an already established consumer
* `messagePermits` → Number of additional permits to grant to the broker for pushing more messages

##### Command Message

Command `Message` is used by the broker to push messages to an existing consumer, within the limits of the given permits.

This command is used in a frame that includes the message payload as well, for which the complete format is specified in the [payload commands](#payload-commands) section.

```protobuf
message CommandMessage {
  "consumer_id" : 1,
  "message_id" : {
    "ledgerId" : 123,
    "entryId" : 456
  }
}
```

##### Command Ack

An `Ack` is used to signal to the broker that a given message has been successfully processed by the application and can be discarded by the broker.

In addition, the broker will also maintain the consumer position based on the acknowledged messages.

```protobuf
message CommandAck {
  "consumer_id" : 1,
  "ack_type" : "Individual",
  "message_id" : {
    "ledgerId" : 123,
    "entryId" : 456
  }
}
```

Parameters:
 * `consumer_id` → Id of an already established consumer
 * `ack_type` → Type of acknowledgment: `Individual` or `Cumulative`
 * `message_id` → Id of the message to acknowledge
 * `validation_error` → *(optional)* Indicates that the consumer has discarded the messages due to: `UncompressedSizeCorruption`, `DecompressionError`, `ChecksumMismatch`, `BatchDeSerializeError`

##### Command CloseConsumer

***Note***: *This command can be sent by either consumer or broker*.

This command behaves the same as [`CloseProducer`](#command-closeproducer)

##### Command RedeliverUnacknowledgedMessages

A consumer can ask the broker to redeliver some or all of the pending messages that were pushed to that particular consumer and not yet acknowledged.

The protobuf object accepts a list of message ids that the consumer wants to be redelivered. If the list is empty, the broker will redeliver all the pending messages.

On redelivery, messages can be sent to the same consumer or, in the case of a shared subscription, spread across all available consumers.

##### Command ReachedEndOfTopic

This is sent by a broker to a particular consumer whenever the topic has been "terminated" and all the messages on the subscription were acknowledged.

The client should use this command to notify the application that no more messages are coming from the consumer.

##### Command ConsumerStats

This command is sent by the client to retrieve Subscriber and Consumer level stats from the broker.

Parameters:
 * `request_id` → Id of the request, used to correlate the request and the response.
 * `consumer_id` → Id of an already established consumer.

##### Command ConsumerStatsResponse

This is the broker's response to a ConsumerStats request by the client. It contains the Subscriber and Consumer level stats of the `consumer_id` sent in the request. If the `error_code` or the `error_message` field is set, it indicates that the request has failed.

##### Command Unsubscribe

This command is sent by the client to unsubscribe the `consumer_id` from the associated topic.

Parameters:
 * `request_id` → Id of the request.
 * `consumer_id` → Id of an already established consumer which needs to unsubscribe.
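Tying the flow-control interaction above together, here is a minimal sketch of the permit accounting a consumer implementation might keep: grant an initial window with the first `Flow`, then top it up once half of it has been drained. All names are hypothetical, and the half-window heuristic is an assumption rather than something mandated by the protocol.

```java
public class PermitAccounting {
    private final int windowSize;
    private int dequeuedSinceLastFlow = 0;

    public PermitAccounting(int windowSize) {
        this.windowSize = windowSize;
    }

    // Permits to grant in the first Flow command, right after Subscribe succeeds.
    public int initialPermits() {
        return windowSize;
    }

    // Called each time the application dequeues a message from the local queue;
    // returns the number of additional permits to send in a new Flow command
    // (0 means no Flow command is needed yet).
    public synchronized int onMessageDequeued() {
        dequeuedSinceLastFlow++;
        if (dequeuedSinceLastFlow >= windowSize / 2) {
            int permits = dequeuedSinceLastFlow;
            dequeuedSinceLastFlow = 0;
            return permits;
        }
        return 0;
    }
}
```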
+

## Service discovery

### Topic lookup

Topic lookup needs to be performed each time a client needs to create or reconnect a producer or a consumer. Lookup is used to discover which particular broker is serving the topic we are about to use.

Lookup can be done with a REST call as described in the [admin API](admin-api-persistent-topics.md#lookup-of-topic) docs.

Since Pulsar 1.16, it is also possible to perform the lookup within the binary protocol.

For the sake of example, let's assume we have a service discovery component running at `pulsar://broker.example.com:6650`

Individual brokers will be running at `pulsar://broker-1.example.com:6650`, `pulsar://broker-2.example.com:6650`, ...

A client can use a connection to the discovery service host to issue a `LookupTopic` command. The response can either be a broker hostname to connect to, or a broker hostname to which to retry the lookup.

The `LookupTopic` command has to be used in a connection that has already gone through the `Connect` / `Connected` initial handshake.

![Topic lookup](/docs/assets/binary-protocol-topic-lookup.png)

```protobuf
message CommandLookupTopic {
  "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic",
  "request_id" : 1,
  "authoritative" : false
}
```

Fields:
 * `topic` → Topic name to look up
 * `request_id` → Id of the request that will be passed with its response
 * `authoritative` → The initial lookup request should use false. When following a redirect response, the client should pass the same value contained in the response

##### LookupTopicResponse

Example of response with successful lookup:

```protobuf
message CommandLookupTopicResponse {
  "request_id" : 1,
  "response" : "Connect",
  "brokerServiceUrl" : "pulsar://broker-1.example.com:6650",
  "brokerServiceUrlTls" : "pulsar+ssl://broker-1.example.com:6651",
  "authoritative" : true
}
```

Example of lookup response with redirection:

```protobuf
message CommandLookupTopicResponse {
  "request_id" : 1,
  "response" : "Redirect",
  "brokerServiceUrl" : "pulsar://broker-2.example.com:6650",
  "brokerServiceUrlTls" : "pulsar+ssl://broker-2.example.com:6651",
  "authoritative" : true
}
```

In this second case, we need to reissue the `LookupTopic` command request to `broker-2.example.com`, and this broker will be able to give a definitive answer to the lookup request.

### Partitioned topics discovery

Partitioned topics metadata discovery is used to find out whether a topic is a "partitioned topic" and how many partitions were set up.

If the topic is marked as "partitioned", the client is expected to create multiple producers or consumers, one for each partition, using the `partition-X` suffix (see the naming sketch below).

This information only needs to be retrieved the first time a producer or consumer is created. There is no need to do this after reconnections.

The discovery of partitioned topics metadata works very similarly to the topic lookup. The client sends a request to the service discovery address, and the response will contain the actual metadata.
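As a hedged sketch of that naming convention, the helper below derives the per-partition topic names a client would create producers or consumers for; the class name is hypothetical, and the exact `-partition-` delimiter is an assumption based on the suffix described above.

```java
import java.util.ArrayList;
import java.util.List;

public class PartitionNames {
    // Derives the per-partition topic names for a partitioned topic,
    // one producer/consumer per partition.
    public static List<String> forTopic(String topic, int numPartitions) {
        List<String> names = new ArrayList<>(numPartitions);
        for (int i = 0; i < numPartitions; i++) {
            names.add(topic + "-partition-" + i);
        }
        return names;
    }
}
```

For example, a topic with 4 partitions would yield names ending in `partition-0` through `partition-3`.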
+

##### Command PartitionedTopicMetadata

```protobuf
message CommandPartitionedTopicMetadata {
  "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic",
  "request_id" : 1
}
```

Fields:
 * `topic` → the topic for which to check the partitions metadata
 * `request_id` → Id of the request that will be passed with its response

##### Command PartitionedTopicMetadataResponse

Example of response with metadata:

```protobuf
message CommandPartitionedTopicMetadataResponse {
  "request_id" : 1,
  "response" : "Success",
  "partitions" : 32
}
```

## Protobuf interface

{% include protobuf.html %}

 diff --git a/site2/docs/developing-codebase.md b/site2/docs/developing-codebase.md new file mode 100644 index 0000000000000000000000000000000000000000..ce69eb42b88a785113eadeb9a853ea4f3ddf8adc --- /dev/null +++ b/site2/docs/developing-codebase.md @@ -0,0 +1,10 @@ +--- +id: develop-codebase +title: The Pulsar codebase +sidebar_label: Codebase +---

The panels below describe some of the core directories in the [Pulsar codebase](https://github.com/apache/incubator-pulsar).

{% include codebase.html %}

 diff --git a/site2/docs/developing-cpp.md b/site2/docs/developing-cpp.md new file mode 100644 index 0000000000000000000000000000000000000000..47bbb37b8e41d7ade5ace4f31c00ab7a2e2a28aa --- /dev/null +++ b/site2/docs/developing-cpp.md @@ -0,0 +1,100 @@ +--- +id: develop-cpp +title: Building Pulsar C++ client +sidebar_label: Building Pulsar C++ client +---

## Supported platforms

The Pulsar C++ client has been successfully tested on **MacOS** and **Linux**.

## System requirements

You need to have the following installed to use the C++ client:

* [CMake](https://cmake.org/)
* [Boost](http://www.boost.org/)
* [Protocol Buffers](https://developers.google.com/protocol-buffers/) 2.6
* [Log4CXX](https://logging.apache.org/log4cxx)
* [libcurl](https://curl.haxx.se/libcurl/)
* [Google Test](https://github.com/google/googletest)
* [JsonCpp](https://github.com/open-source-parsers/jsoncpp)

## Compilation

There are separate compilation instructions for [MacOS](#macos) and [Linux](#linux). For both systems, start by cloning the Pulsar repository:

```shell
$ git clone https://github.com/apache/incubator-pulsar
```

### Linux

First, install all of the necessary dependencies:

```shell
$ apt-get install cmake libssl-dev libcurl4-openssl-dev liblog4cxx-dev \
  libprotobuf-dev libboost-all-dev google-mock libgtest-dev libjsoncpp-dev
```

Then compile and install [Google Test](https://github.com/google/googletest):

```shell
# libgtest-dev version 1.8.0 or above
$ cd /usr/src/googletest
$ sudo cmake .
$ sudo make
$ sudo cp ./googlemock/libgmock.a ./googlemock/gtest/libgtest.a /usr/lib/

# libgtest-dev version below 1.8.0
$ cd /usr/src/gtest
$ sudo cmake .
$ sudo make
$ sudo cp libgtest.a /usr/lib

$ cd /usr/src/gmock
$ sudo cmake .
$ sudo make
$ sudo cp libgmock.a /usr/lib
```

Finally, compile the Pulsar client library for C++ inside the Pulsar repo:

```shell
$ cd pulsar-client-cpp
$ cmake .
$ make
```

The resulting files, `libpulsar.so` and `libpulsar.a`, will be placed in the `lib` folder of the repo, while two tools, `perfProducer` and `perfConsumer`, will be placed in the `perf` directory.
+

### MacOS

First, install all of the necessary dependencies:

```shell
# OpenSSL installation
$ brew install openssl
$ export OPENSSL_INCLUDE_DIR=/usr/local/opt/openssl/include/
$ export OPENSSL_ROOT_DIR=/usr/local/opt/openssl/

# Protocol Buffers installation
$ brew tap homebrew/versions
$ brew install protobuf260
$ brew install boost
$ brew install log4cxx

# Google Test installation
$ git clone https://github.com/google/googletest.git
$ cd googletest
$ cmake .
$ make install
```

Then compile the Pulsar client library in the repo that you cloned:

```shell
$ cd pulsar-client-cpp
$ cmake .
$ make
```
 diff --git a/site2/docs/developing-load-manager.md b/site2/docs/developing-load-manager.md new file mode 100644 index 0000000000000000000000000000000000000000..f545e5afa97942606f29a618f31da9eba6bb728c --- /dev/null +++ b/site2/docs/developing-load-manager.md @@ -0,0 +1,214 @@ +--- +id: develop-load-manager +title: Modular load manager +sidebar_label: Modular load manager +---

The *modular load manager*, implemented in [`ModularLoadManagerImpl`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/ModularLoadManagerImpl.java), is a flexible alternative to the previously implemented load manager, [`SimpleLoadManagerImpl`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/SimpleLoadManagerImpl.java), which attempts to simplify how load is managed while also providing abstractions so that complex load management strategies may be implemented.

## Usage

There are two ways that you can enable the modular load manager:

1. Change the value of the `loadManagerClassName` parameter in `conf/broker.conf` from `org.apache.pulsar.broker.loadbalance.impl.SimpleLoadManagerImpl` to `org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl`.
2. Using the `pulsar-admin` tool. Here's an example:

   ```shell
   $ pulsar-admin brokers update-dynamic-config \
     --config loadManagerClassName \
     --value org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl
   ```

   You can use the same method to change back to the original value. In either case, any mistake in specifying the load manager will cause Pulsar to default to `SimpleLoadManagerImpl`.

## Verification

There are a few different ways to determine which load manager is being used:

1. Use `pulsar-admin` to examine the `loadManagerClassName` element:

   ```shell
   $ bin/pulsar-admin brokers get-all-dynamic-config
   {
     "loadManagerClassName" : "org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl"
   }
   ```

   If there is no `loadManagerClassName` element, then the default load manager is used.

2. Consult a ZooKeeper load report. With the modular load manager, the load report in `/loadbalance/brokers/...` will have many differences; for example, the `systemResourceUsage` sub-elements (`bandwidthIn`, `bandwidthOut`, etc.) are now all at the top level.
Here is an example load report from the modular load manager:

   ```json
   {
     "bandwidthIn": {
       "limit": 10240000.0,
       "usage": 4.256510416666667
     },
     "bandwidthOut": {
       "limit": 10240000.0,
       "usage": 5.287239583333333
     },
     "bundles": [],
     "cpu": {
       "limit": 2400.0,
       "usage": 5.7353247655435915
     },
     "directMemory": {
       "limit": 16384.0,
       "usage": 1.0
     }
   }
   ```

   With the simple load manager, the load report in `/loadbalance/brokers/...` will look like this:

   ```json
   {
     "systemResourceUsage": {
       "bandwidthIn": {
         "limit": 10240000.0,
         "usage": 0.0
       },
       "bandwidthOut": {
         "limit": 10240000.0,
         "usage": 0.0
       },
       "cpu": {
         "limit": 2400.0,
         "usage": 0.0
       },
       "directMemory": {
         "limit": 16384.0,
         "usage": 1.0
       },
       "memory": {
         "limit": 8192.0,
         "usage": 3903.0
       }
     }
   }
   ```

3. The command-line [broker monitor](reference-cli-tools.md#monitor-brokers) will have a different output format depending on which load manager implementation is being used.

   Here is an example from the modular load manager:

   ```
   ===================================================================================================================
   ||SYSTEM      |CPU %       |MEMORY %    |DIRECT %    |BW IN %     |BW OUT %    |MAX %       ||
   ||            |0.00        |48.33       |0.01        |0.00        |0.00        |48.33       ||
   ||COUNT       |TOPIC       |BUNDLE      |PRODUCER    |CONSUMER    |BUNDLE +    |BUNDLE -    ||
   ||            |4           |4           |0           |2           |4           |0           ||
   ||LATEST      |MSG/S IN    |MSG/S OUT   |TOTAL       |KB/S IN     |KB/S OUT    |TOTAL       ||
   ||            |0.00        |0.00        |0.00        |0.00        |0.00        |0.00        ||
   ||SHORT       |MSG/S IN    |MSG/S OUT   |TOTAL       |KB/S IN     |KB/S OUT    |TOTAL       ||
   ||            |0.00        |0.00        |0.00        |0.00        |0.00        |0.00        ||
   ||LONG        |MSG/S IN    |MSG/S OUT   |TOTAL       |KB/S IN     |KB/S OUT    |TOTAL       ||
   ||            |0.00        |0.00        |0.00        |0.00        |0.00        |0.00        ||
   ===================================================================================================================
   ```

   Here is an example from the simple load manager:

   ```
   ===================================================================================================================
   ||COUNT       |TOPIC       |BUNDLE      |PRODUCER    |CONSUMER    |BUNDLE +    |BUNDLE -    ||
   ||            |4           |4           |0           |2           |0           |0           ||
   ||RAW SYSTEM  |CPU %       |MEMORY %    |DIRECT %    |BW IN %     |BW OUT %    |MAX %       ||
   ||            |0.25        |47.94       |0.01        |0.00        |0.00        |47.94       ||
   ||ALLOC SYSTEM|CPU %       |MEMORY %    |DIRECT %    |BW IN %     |BW OUT %    |MAX %       ||
   ||            |0.20        |1.89        |            |1.27        |3.21        |3.21        ||
   ||RAW MSG     |MSG/S IN    |MSG/S OUT   |TOTAL       |KB/S IN     |KB/S OUT    |TOTAL       ||
   ||            |0.00        |0.00        |0.00        |0.01        |0.01        |0.01        ||
   ||ALLOC MSG   |MSG/S IN    |MSG/S OUT   |TOTAL       |KB/S IN     |KB/S OUT    |TOTAL       ||
   ||            |54.84       |134.48      |189.31      |126.54      |320.96      |447.50      ||
   ===================================================================================================================
   ```

It is important to note that the modular load manager is _centralized_, meaning that all requests to assign a bundle---whether it's been seen before or whether this is the first time---only get handled by the _lead_ broker (which can change over time). To determine the current lead broker, examine the `/loadbalance/leader` node in ZooKeeper.

## Implementation

### Data

The data monitored by the modular load manager is contained in the [`LoadData`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/LoadData.java) class. Here, the available data is subdivided into the bundle data and the broker data.
+

#### Broker

The broker data is contained in the [`BrokerData`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/BrokerData.java) class. It is further subdivided into two parts, one being the local data which every broker individually writes to ZooKeeper, and the other being the historical broker data which is written to ZooKeeper by the leader broker.

##### Local Broker Data

The local broker data is contained in the class [`LocalBrokerData`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-common/src/main/java/org/apache/pulsar/policies/data/loadbalancer/LocalBrokerData.java) and provides information about the following resources:

* CPU usage
* JVM heap memory usage
* Direct memory usage
* Bandwidth in/out usage
* Most recent total message rate in/out across all bundles
* Total number of topics, bundles, producers, and consumers
* Names of all bundles assigned to this broker
* Most recent changes in bundle assignments for this broker

The local broker data is updated periodically according to the service configuration `loadBalancerReportUpdateMaxIntervalMinutes`. After any broker updates its local broker data, the leader broker will receive the update immediately via a ZooKeeper watch, where the local data is read from the ZooKeeper node `/loadbalance/brokers/`

##### Historical Broker Data

The historical broker data is contained in the [`TimeAverageBrokerData`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/TimeAverageBrokerData.java) class.

In order to reconcile the need to make good decisions in a steady-state scenario and make reactive decisions in a critical scenario, the historical data is split into two parts: the short-term data for reactive decisions, and the long-term data for steady-state decisions. Both time frames maintain the following information:

* Message rate in/out for the entire broker
* Message throughput in/out for the entire broker

Unlike the bundle data, the broker data does not maintain samples for the global broker message rates and throughputs, which are not expected to remain steady as new bundles are removed or added. Instead, this data is aggregated over the short-term and long-term data for the bundles. See the section on bundle data to understand how that data is collected and maintained.

The historical broker data is updated for each broker in memory by the leader broker whenever any broker writes its local data to ZooKeeper. Then, the historical data is written to ZooKeeper by the leader broker periodically according to the configuration `loadBalancerResourceQuotaUpdateIntervalMinutes`.

##### Bundle Data

The bundle data is contained in the [`BundleData`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/BundleData.java) class. Like the historical broker data, the bundle data is split into a short-term and a long-term time frame. The information maintained in each time frame is:

* Message rate in/out for this bundle
* Message throughput in/out for this bundle
* Current number of samples for this bundle

The time frames are implemented by maintaining the average of these values over a set, limited number of samples, where the samples are obtained through the message rate and throughput values in the local data.
Thus, if the update interval for the local data is 2 minutes, the number of short samples is 10, and the number of long samples is 1000, the short-term data is maintained over a period of `10 samples * 2 minutes / sample = 20 minutes`, while the long-term data is similarly maintained over a period of 2000 minutes. Whenever there are not enough samples to satisfy a given time frame, the average is taken only over the existing samples. When no samples are available, default values are assumed until they are overwritten by the first sample. Currently, the default values are

* Message rate in/out: 50 messages per second both ways
* Message throughput in/out: 50KB per second both ways

The bundle data is updated in memory on the leader broker whenever any broker writes its local data to ZooKeeper. Then, the bundle data is written to ZooKeeper by the leader broker periodically at the same time as the historical broker data, according to the configuration `loadBalancerResourceQuotaUpdateIntervalMinutes`.

### Traffic Distribution

The modular load manager uses the abstraction provided by [`ModularLoadManagerStrategy`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/ModularLoadManagerStrategy.java) to make decisions about bundle assignment. The strategy makes a decision by considering the service configuration, the entire load data, and the bundle data for the bundle to be assigned. Currently, the only supported strategy is [`LeastLongTermMessageRate`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LeastLongTermMessageRate.java), though soon users will have the ability to inject their own strategies if desired.

#### Least Long Term Message Rate Strategy

As its name suggests, the least long term message rate strategy attempts to distribute bundles across brokers so that the message rate in the long-term time window for each broker is roughly the same. However, simply balancing load based on message rate does not handle the issue of asymmetric resource burden per message on each broker. Thus, the system resource usages, which are CPU, memory, direct memory, bandwidth in, and bandwidth out, are also considered in the assignment process. This is done by weighting the final message rate according to `1 / (overload_threshold - max_usage)`, where `overload_threshold` corresponds to the configuration `loadBalancerBrokerOverloadedThresholdPercentage` and `max_usage` is the maximum proportion among the system resources that is being utilized by the candidate broker. This multiplier ensures that machines that are being more heavily taxed by the same message rates will receive less load. In particular, it tries to ensure that if one machine is overloaded, then all machines are approximately overloaded. In the case in which a broker's max usage exceeds the overload threshold, that broker is not considered for bundle assignment. If all brokers are overloaded, the bundle is randomly assigned.
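As a rough, hedged sketch of that weighting rule (not the actual `LeastLongTermMessageRate` source), a candidate broker's long-term message rate could be scored as follows, with overloaded brokers excluded; all names here are illustrative:

```java
public class BrokerScore {
    // Scores a candidate broker per the rule above: weight the long-term
    // message rate by 1 / (overloadThreshold - maxUsage). Lower is better.
    public static double weightedRate(double longTermMsgRate,
                                      double maxUsage,
                                      double overloadThreshold) {
        if (maxUsage >= overloadThreshold) {
            // Overloaded broker: not considered for bundle assignment.
            return Double.POSITIVE_INFINITY;
        }
        return longTermMsgRate / (overloadThreshold - maxUsage);
    }
}
```

The broker with the lowest weighted rate would be preferred; as noted above, if every broker exceeds the threshold, the bundle is assigned randomly.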
+ diff --git a/site2/docs/developing-schema.md b/site2/docs/developing-schema.md new file mode 100644 index 0000000000000000000000000000000000000000..d7aee264ded89dc264365a401ae02b6c7004102e --- /dev/null +++ b/site2/docs/developing-schema.md @@ -0,0 +1,57 @@ +--- +id: develop-schema +title: Custom schema storage +sidebar_label: Custom schema storage +---

By default, Pulsar stores data type [schemas](getting-started-concepts-and-architecture.md#schema-registry) in [Apache BookKeeper](https://bookkeeper.apache.org) (which is deployed alongside Pulsar). You can, however, use another storage system if you wish. This doc walks you through creating your own schema storage implementation.

In order to use a non-default (i.e. non-BookKeeper) storage system for Pulsar schemas, you need to implement two Java interfaces: [`SchemaStorage`](#schemastorage-interface) and [`SchemaStorageFactory`](#schemastoragefactory-interface).

## SchemaStorage interface

The `SchemaStorage` interface has the following methods:

```java
public interface SchemaStorage {
    // How schemas are updated
    CompletableFuture<SchemaVersion> put(String key, byte[] value, byte[] hash);

    // How schemas are fetched from storage
    CompletableFuture<StoredSchema> get(String key, SchemaVersion version);

    // How schemas are deleted
    CompletableFuture<SchemaVersion> delete(String key);

    // Utility method for converting a schema version byte array to a SchemaVersion object
    SchemaVersion versionFromBytes(byte[] version);

    // Startup behavior for the schema storage client
    void start() throws Exception;

    // Shutdown behavior for the schema storage client
    void close() throws Exception;
}
```

> For a full-fledged example schema storage implementation, see the [`BookKeeperSchemaStorage`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorage.java) class.

## SchemaStorageFactory interface

```java
public interface SchemaStorageFactory {
    @NotNull
    SchemaStorage create(PulsarService pulsar) throws Exception;
}
```

> For a full-fledged example schema storage factory implementation, see the [`BookKeeperSchemaStorageFactory`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorageFactory.java) class.

## Deployment

In order to use your custom schema storage implementation, you'll need to:

1. Package the implementation in a [JAR](https://docs.oracle.com/javase/tutorial/deployment/jar/basicsindex.html) file.
1. Add that jar to the `lib` folder in your Pulsar [binary or source distribution](getting-started-standalone.md#installing-pulsar).
1. Change the `schemaRegistryStorageClassName` configuration in [`broker.conf`](reference-configuration.md#broker) to your custom factory class (i.e. the `SchemaStorageFactory` implementation, not the `SchemaStorage` implementation).
1. Start up Pulsar.

 diff --git a/site2/docs/developing-tools.md b/site2/docs/developing-tools.md new file mode 100644 index 0000000000000000000000000000000000000000..99d472d666009ecddd8a8c200840f0f897c1ea4b --- /dev/null +++ b/site2/docs/developing-tools.md @@ -0,0 +1,105 @@ +--- +id: develop-tools +title: Simulation tools +sidebar_label: Simulation tools +---

It is sometimes necessary to create a test environment and incur artificial load to observe how well load managers
The load simulation controller, the load simulation client, and the broker monitor were created as an +effort to make create this load and observe the effects on the managers more easily. + +## Simulation Client +The simulation client is a machine which will create and subscribe to topics with configurable message rates and sizes. +Because it is sometimes necessary in simulating large load to use multiple client machines, the user does not interact +with the simulation client directly, but instead delegates their requests to the simulation controller, which will then +send signals to clients to start incurring load. The client implementation is in the class +`org.apache.pulsar.testclient.LoadSimulationClient`. + +### Usage +To Start a simulation client, use the `pulsar-perf` script with the command `simulation-client` as follows: + +``` +pulsar-perf simulation-client --port --service-url +``` + +The client will then be ready to receive controller commands. +## Simulation Controller +The simulation controller send signals to the simulation clients, requesting them to create new topics, stop old +topics, change the load incurred by topics, as well as several other tasks. It is implemented in the class +`org.apache.pulsar.testclient.LoadSimulationController` and presents a shell to the user as an interface to send +command with. + +### Usage +To start a simulation controller, use the `pulsar-perf` script with the command `simulation-controller` as follows: + +``` +pulsar-perf simulation-controller --cluster --client-port +--clients +``` + +The clients should already be started before the controller is started. You will then be presented with a simple prompt, +where you can issue commands to simulation clients. Arguments often refer to tenant names, namespace names, and topic +names. In all cases, the BASE name of the tenants, namespaces, and topics are used. For example, for the topic +`persistent://my_tenant/my_cluster/my_namespace/my_topic`, the tenant name is `my_tenant`, the namespace name is +`my_namespace`, and the topic name is `my_topic`. The controller can perform the following actions: + +* Create a topic with a producer and a consumer + * `trade [--rate ] + [--rand-rate ,] + [--size ]` +* Create a group of topics with a producer and a consumer + * `trade_group [--rate ] + [--rand-rate ,] + [--separation ] [--size ] + [--topics-per-namespace ]` +* Change the configuration of an existing topic + * `change [--rate ] + [--rand-rate ,] + [--size ]` +* Change the configuration of a group of topics + * `change_group [--rate ] [--rand-rate ,] + [--size ] [--topics-per-namespace ]` +* Shutdown a previously created topic + * `stop ` +* Shutdown a previously created group of topics + * `stop_group ` +* Copy the historical data from one ZooKeeper to another and simulate based on the message rates and sizes in that +history + * `copy [--rate-multiplier value]` +* Simulate the load of the historical data on the current ZooKeeper (should be same ZooKeeper being simulated on) + * `simulate [--rate-multiplier value]` +* Stream the latest data from the given active ZooKeeper to simulate the real-time load of that ZooKeeper. + * `stream [--rate-multiplier value]` + +The "group" arguments in these commands allow the user to create or affect multiple topics at once. Groups are created +when calling the `trade_group` command, and all topics from these groups may be subsequently modified or stopped +with the `change_group` and `stop_group` commands respectively. 
All ZooKeeper arguments are of the form `zookeeper_host:port`.

### Difference Between Copy, Simulate, and Stream

The commands `copy`, `simulate`, and `stream` are very similar but have significant differences. `copy` is used when you want to simulate the load of a static, external ZooKeeper on the ZooKeeper you are simulating on. Thus, `source zookeeper` should be the ZooKeeper you want to copy and `target zookeeper` should be the ZooKeeper you are simulating on, and then it will get the full benefit of the historical data of the source in both load manager implementations. `simulate`, on the other hand, takes in only one ZooKeeper: the one you are simulating on. It assumes that you are simulating on a ZooKeeper that has historical data for `SimpleLoadManagerImpl` and creates equivalent historical data for `ModularLoadManagerImpl`. Then, the load according to the historical data is simulated by the clients. Finally, `stream` takes in an active ZooKeeper different from the ZooKeeper being simulated on, streams load data from it, and simulates the real-time load. In all cases, the optional `rate-multiplier` argument allows the user to simulate some proportion of the load. For instance, using `--rate-multiplier 0.05` will cause messages to be sent at only `5%` of the rate of the load that is being simulated.

## Broker Monitor

To observe the behavior of the load manager in these simulations, one may utilize the broker monitor, which is implemented in `org.apache.pulsar.testclient.BrokerMonitor`. The broker monitor will print tabular load data to the console as it is updated using watchers.

### Usage

To start a broker monitor, use the `monitor-brokers` command in the `pulsar-perf` script:

```
pulsar-perf monitor-brokers --connect-string <zookeeper host:port>
```

The console will then continuously print load data until it is interrupted.

 diff --git a/site2/docs/functions-api.md b/site2/docs/functions-api.md new file mode 100644 index 0000000000000000000000000000000000000000..c6730707672c2bdc3e354a818288ad247c47084d --- /dev/null +++ b/site2/docs/functions-api.md @@ -0,0 +1,712 @@ +--- +id: functions-api +title: The Pulsar Functions API +sidebar_label: API +---

[Pulsar Functions](functions-overview.md) provides an easy-to-use API that developers can use to create and manage processing logic for the Apache Pulsar messaging system. With Pulsar Functions, you can write functions of any level of complexity in [Java](#java) or [Python](#python) and run them in conjunction with a Pulsar cluster without needing to run a separate stream processing engine.

> For a more in-depth overview of the Pulsar Functions feature, see the [Pulsar Functions overview](functions-overview.md).

## Core programming model

Pulsar Functions provide a wide range of functionality but are based on a very simple programming model. You can think of Pulsar Functions as lightweight processes that

* consume messages from one or more Pulsar topics and then
* apply some user-defined processing logic to each incoming message. That processing logic could be just about anything you want, including
  * producing the resulting, processed message on another Pulsar topic, or
  * doing something else with the message, such as writing results to an external database.
+

You could use Pulsar Functions, for example, to set up the following processing chain:

* A [Python](#python) function listens on the `raw-sentences` topic and "[sanitizes](#example-function)" incoming strings (removing extraneous whitespace and converting all characters to lower case) and then publishes the results to a `sanitized-sentences` topic
* A [Java](#java) function listens on the `sanitized-sentences` topic, counts the number of times each word appears within a specified time window, and publishes the results to a `results` topic
* Finally, a Python function listens on the `results` topic and writes the results to a MySQL table

### Example function

Here's an example "input sanitizer" function written in Python and stored in a `sanitizer.py` file:

```python
def clean_string(s):
    return s.strip().lower()

def process(input):
    return clean_string(input)
```

Some things to note about this Pulsar Function:

* There is no client, producer, or consumer object involved. All message "plumbing" is already taken care of for you, enabling you to worry only about processing logic.
* No topics, subscription types, tenants, or namespaces are specified in the function logic itself. Instead, topics are specified upon [deployment](#example-deployment). This means that you can use and re-use Pulsar Functions across topics, tenants, and namespaces without needing to hard-code those attributes.

### Example deployment

Deploying Pulsar Functions is handled by the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool, in particular the [`functions`](reference-pulsar-admin.md#functions) command. Here's an example command that would run our [sanitizer](#example-function) function from above in [local run](functions-deploying.md#local-run) mode:

```bash
$ bin/pulsar-admin functions localrun \
  --py sanitizer.py \
  --className sanitizer \
  --tenant public \
  --namespace default \
  --name sanitizer-function \
  --inputs dirty-strings-in \
  --output clean-strings-out \
  --logTopic sanitizer-logs
```

In this command:

* `--py` points to the Python file with the function's code
* `--className` names the class or function holding the processing logic
* `--tenant` sets the function's tenant (derived from the topic name by default)
* `--namespace` sets the function's namespace (derived from the topic name by default)
* `--name` sets the name of the function (the class name by default)
* `--inputs` lists the input topic(s) for the function
* `--output` sets the output topic for the function
* `--logTopic` sets the topic to which all function logs are published

For instructions on running functions in your Pulsar cluster, see the [Deploying Pulsar Functions](functions-deploying.md) guide.
+

### Available APIs

In both Java and Python, you have two options for writing Pulsar Functions:

Interface | Description | Use cases
:---------|:------------|:---------
Language-native interface | No Pulsar-specific libraries or special dependencies required (only core libraries from Java/Python) | Functions that don't require access to the function's [context](#context)
Pulsar Function SDK for Java/Python | Pulsar-specific libraries that provide a range of functionality not provided by "native" interfaces | Functions that require access to the function's [context](#context)

In Python, for example, this language-native function, which adds an exclamation point to all incoming strings and publishes the resulting string to a topic, would have no external dependencies:

```python
def process(input):
    return "{}!".format(input)
```

This function, however, would use the Pulsar Functions [SDK for Python](#python-sdk):

```python
from pulsar import Function

class DisplayFunctionName(Function):
    def process(self, input, context):
        function_name = context.function_name()
        return "The function processing this message has the name {0}".format(function_name)
```

### Serialization and deserialization (SerDe)

SerDe stands for **Ser**ialization and **De**serialization. All Pulsar Functions use SerDe for message handling. How SerDe works by default depends on the language you're using for a particular function:

* In [Python](#python-serde), the default SerDe is identity, meaning that the type is serialized as whatever type the producer function returns
* In [Java](#java-serde), a number of commonly used types (`String`s, `Integer`s, etc.) are supported by default

In both languages, however, you can write your own custom SerDe logic for more complex, application-specific types. See the docs for [Java](#java-serde) and [Python](#python-serde) for language-specific instructions.

### Context

Both the [Java](#java-sdk) and [Python](#python-sdk) SDKs provide access to a **context object** that can be used by the function. This context object provides a wide variety of information and functionality to the function:

* The name and ID of the Pulsar Function
* The message ID of each message. Each Pulsar message is automatically assigned an ID.
* The name of the topic on which the message was sent
* The names of all input topics as well as the output topic associated with the function
* The name of the class used for [SerDe](#serde)
* The tenant and namespace associated with the function
* The ID of the Pulsar Functions instance running the function
* The version of the function
* The [logger object](#logging) used by the function, which can be used to create function log messages
* Access to arbitrary [user config](#user-config) values supplied via the CLI
* An interface for recording [metrics](functions-metrics.md)

### User config

When you run or update Pulsar Functions created using the [SDK](#apis), you can pass arbitrary key/values to them via the command line with the `--userConfig` flag. Key/values must be specified as JSON.
Here's an example of a function creation command that passes a user config key/value to a function:

```bash
$ bin/pulsar-admin functions create \
  --name word-filter \
  --userConfig '{"forbidden-word":"rosebud"}'
  # ... other function configs
```

If the function were a Python function, that config value could be accessed like this:

```python
from pulsar import Function

class WordFilter(Function):
    def process(self, input, context):
        forbidden_word = context.user_config()["forbidden-word"]

        # Don't publish the message if it contains the user-supplied
        # forbidden word
        if forbidden_word in input:
            pass
        # Otherwise publish the message
        else:
            return input
```

## Functions for Java

Writing Pulsar Functions in Java involves implementing one of two interfaces:

* The [`java.util.function.Function`](https://docs.oracle.com/javase/8/docs/api/java/util/function/Function.html) interface
* The `org.apache.pulsar.functions.api.Function` interface. This interface works much like the `java.util.function.Function` interface, but with the important difference that it provides a `org.apache.pulsar.functions.api.Context` object that you can use in a [variety of ways](#context)

### Getting started

In order to write Pulsar Functions in Java, you'll need to install the proper [dependencies](#java-dependencies) and package your function [as a JAR](#java-packaging).

#### Dependencies

How you get started writing Pulsar Functions in Java depends on which API you're using:

* If you're writing a [Java native function](#java-native), you won't need any external dependencies.
* If you're writing a [Java SDK function](#java-sdk), you'll need to import the `pulsar-functions-api` library.

  Here's an example for a Maven `pom.xml` configuration file:

  ```xml
  <dependency>
      <groupId>org.apache.pulsar</groupId>
      <artifactId>pulsar-functions-api</artifactId>
      <version>2.0.0-incubating-SNAPSHOT</version>
  </dependency>
  ```

  Here's an example for a Gradle `build.gradle` configuration file:

  ```groovy
  dependencies {
      compile group: 'org.apache.pulsar', name: 'pulsar-functions-api', version: '2.0.0-incubating-SNAPSHOT'
  }
  ```

#### Packaging

Whether you're writing Java Pulsar Functions using the [native](#java-native) Java `java.util.function.Function` interface or using the [Java SDK](#java-sdk), you'll need to package your function(s) as a "fat" JAR.

> #### Starter repo
> If you'd like to get up and running quickly, you can use [this repo](https://github.com/streamlio/pulsar-functions-java-starter), which contains the necessary Maven configuration to build a fat JAR as well as some example functions.

### Java native functions

If your function doesn't require access to its [context](#java-context), you can create a Pulsar Function by implementing the [`java.util.function.Function`](https://docs.oracle.com/javase/8/docs/api/java/util/function/Function.html) interface, which has this very simple, single-method signature:

```java
public interface Function<I, O> {
    O apply(I input);
}
```

Here's an example function that takes a string as its input, adds an exclamation point to the end of the string, and then publishes the resulting string:

```java
import java.util.function.Function;

public class ExclamationFunction implements Function<String, String> {
    @Override
    public String apply(String input) {
        return String.format("%s!", input);
    }
}
```

In general, you should use native functions when you don't need access to the function's [context](#context).
If you *do* need access to the function's context, then we recommend using the [Pulsar Functions Java SDK](#java-sdk).

#### Java native examples

There is one example Java native function in [this folder](https://github.com/apache/incubator-pulsar/tree/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples):

* [`JavaNativeExclmationFunction`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/JavaNativeExclmationFunction.java)

### Java SDK functions

To get started developing Pulsar Functions using the Java SDK, you'll need to add a dependency on the `pulsar-functions-api` artifact to your project. Instructions can be found [above](#java-dependencies).

> An easy way to get up and running with Pulsar Functions in Java is to clone the [`pulsar-functions-java-starter`](https://github.com/streamlio/pulsar-functions-java-starter) repo and follow the instructions there.

#### Java SDK examples

There are several example Java SDK functions in [this folder](https://github.com/apache/incubator-pulsar/tree/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples):

Function name | Description
:-------------|:-----------
[`ContextFunction`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/ContextFunction.java) | Illustrates [context](#context)-specific functionality like [logging](#java-logging) and [metrics](#java-metrics)
[`CounterFunction`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/CounterFunction.java) | Illustrates usage of Pulsar Function [counters](functions-overview.md#counters)
[`ExclamationFunction`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/ExclamationFunction.java) | A basic string manipulation function for the Java SDK
[`LoggingFunction`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/LoggingFunction.java) | A function that shows how [logging](#java-logging) works for Java
[`PublishFunction`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/PublishFunction.java) | Publishes results to a topic specified in the function's [user config](#java-user-config) (rather than on the function's output topic)
[`UserConfigFunction`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/UserConfigFunction.java) | A function that consumes [user-supplied configuration](#java-user-config) values
[`UserMetricFunction`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/UserMetricFunction.java) | A function that records metrics
[`VoidFunction`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/UserMetricFunction.java) | A simple [void function](#void-functions)

### Java context object

The `org.apache.pulsar.functions.api.Context` interface provides a
The various method signatures for the `Context` interface are listed below:
+
+```java
+public interface Context {
+    byte[] getMessageId();
+    String getTopicName();
+    Collection<String> getSourceTopics();
+    String getSinkTopic();
+    String getOutputSerdeClassName();
+    String getTenant();
+    String getNamespace();
+    String getFunctionName();
+    String getFunctionId();
+    String getInstanceId();
+    String getFunctionVersion();
+    Logger getLogger();
+    Map<String, String> getUserConfigMap();
+    Optional<String> getUserConfigValue(String key);
+    String getUserConfigValueOrDefault(String key, String defaultValue);
+    void recordMetric(String metricName, double value);
+    <O> CompletableFuture<Void> publish(String topicName, O object, String serDeClassName);
+    <O> CompletableFuture<Void> publish(String topicName, O object);
+    CompletableFuture<Void> ack(byte[] messageId, String topic);
+}
+```
+
+Here's an example function that uses several methods available via the `Context` object:
+
+```java
+import org.apache.pulsar.functions.api.Context;
+import org.apache.pulsar.functions.api.Function;
+import org.slf4j.Logger;
+
+import java.util.stream.Collectors;
+
+public class ContextFunction implements Function<String, Void> {
+    public Void process(String input, Context context) {
+        Logger LOG = context.getLogger();
+        String inputTopics = context.getSourceTopics().stream().collect(Collectors.joining(", "));
+        String functionName = context.getFunctionName();
+
+        String logMessage = String.format("A message with a value of \"%s\" has arrived on one of the following topics: %s\n",
+                input,
+                inputTopics);
+
+        LOG.info(logMessage);
+
+        String metricName = String.format("function-%s-messages-received", functionName);
+        context.recordMetric(metricName, 1);
+
+        return null;
+    }
+}
+```
+
+### Void functions
+
+Pulsar Functions can publish results to an output topic, but this isn't required. You can also have functions that simply produce a log, write results to a database, etc. Here's a function that writes a simple log every time a message is received:
+
+```java
+import org.apache.pulsar.functions.api.Context;
+import org.apache.pulsar.functions.api.Function;
+import org.slf4j.Logger;
+
+public class LogFunction implements Function<String, Void> {
+    public Void process(String input, Context context) {
+        Logger LOG = context.getLogger();
+        LOG.info("The following message was received: {}", input);
+        return null;
+    }
+}
+```
+
+> When using Java functions in which the output type is `Void`, the function must *always* return `null`.
+
+### Java SerDe
+
+Pulsar Functions use [SerDe](#serde) when publishing data to and consuming data from Pulsar topics. When you're writing Pulsar Functions in Java, the following basic Java types are built in and supported by default:
+
+* `String`
+* `Double`
+* `Integer`
+* `Float`
+* `Long`
+* `Short`
+* `Byte`
+
+Those built-in types cover most simple cases. If you need to pass custom, application-specific types between functions, you'll need to provide your own SerDe logic by implementing this interface:
+
+```java
+public interface SerDe<T> {
+    T deserialize(byte[] input);
+    byte[] serialize(T input);
+}
+```
+
+#### Java SerDe example
+
+Imagine that you're writing Pulsar Functions in Java that are processing tweet objects. Here's a simple example `Tweet` class:
+
+```java
+public class Tweet {
+    private String username;
+    private String tweetContent;
+
+    public Tweet(String username, String tweetContent) {
+        this.username = username;
+        this.tweetContent = tweetContent;
+    }
+
+    // Standard setters and getters
+}
+```
+
+In order to be able to pass `Tweet` objects directly between Pulsar Functions, you'll need to provide a custom SerDe class.
In the example below, `Tweet` objects are basically strings in which the username and tweet content are separated by a `|`.
+
+```java
+package com.example.serde;
+
+import org.apache.pulsar.functions.api.SerDe;
+
+import java.util.regex.Pattern;
+
+public class TweetSerde implements SerDe<Tweet> {
+    public Tweet deserialize(byte[] input) {
+        String s = new String(input);
+        String[] fields = s.split(Pattern.quote("|"));
+        return new Tweet(fields[0], fields[1]);
+    }
+
+    public byte[] serialize(Tweet input) {
+        return String.format("%s|%s", input.getUsername(), input.getTweetContent()).getBytes();
+    }
+}
+```
+
+To apply this custom SerDe to a particular Pulsar Function, you would need to:
+
+* Package the `Tweet` and `TweetSerde` classes into a JAR
+* Specify a path to the JAR and SerDe class name when deploying the function
+
+Here's an example [`create`](reference-pulsar-admin.md#pulsar-admin-functions-create) operation:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --jar /path/to/your.jar \
+  --outputSerdeClassName com.example.serde.TweetSerde \
+  # Other function attributes
+```
+
+> #### Custom SerDe classes must be packaged with your function JARs
+> Pulsar does not store your custom SerDe classes separately from your Pulsar Functions. That means that you'll need to always include your SerDe classes in your function JARs. If not, Pulsar will return an error.
+
+### Java logging
+
+Pulsar Functions that use the [Java SDK](#java-sdk) have access to an [SLF4j](https://www.slf4j.org/) [`Logger`](https://www.slf4j.org/api/org/slf4j/Logger.html) object that can be used to produce logs at the chosen log level. Here's a simple example function that logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`:
+
+```java
+import org.apache.pulsar.functions.api.Context;
+import org.apache.pulsar.functions.api.Function;
+import org.slf4j.Logger;
+
+public class LoggingFunction implements Function<String, Void> {
+    @Override
+    public Void process(String input, Context context) {
+        Logger LOG = context.getLogger();
+        String messageId = new String(context.getMessageId());
+
+        if (input.contains("danger")) {
+            LOG.warn("A warning was received in message {}", messageId);
+        } else {
+            LOG.info("Message {} received\nContent: {}", messageId, input);
+        }
+
+        return null;
+    }
+}
+```
+
+If you want your function to produce logs, you need to specify a log topic when creating or running the function. Here's an example:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --jar my-functions.jar \
+  --className my.package.LoggingFunction \
+  --logTopic persistent://public/default/logging-function-logs \
+  # Other function configs
+```
+
+Now, all logs produced by the `LoggingFunction` above can be accessed via the `persistent://public/default/logging-function-logs` topic.
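+
+Since the log topic is a regular Pulsar topic, those logs can be read with any Pulsar consumer. Here's a minimal sketch using the Pulsar Java client (the service URL and subscription name are illustrative assumptions):
+
+```java
+import org.apache.pulsar.client.api.Consumer;
+import org.apache.pulsar.client.api.Message;
+import org.apache.pulsar.client.api.PulsarClient;
+
+public class LogTopicReader {
+    public static void main(String[] args) throws Exception {
+        PulsarClient client = PulsarClient.builder()
+                .serviceUrl("pulsar://localhost:6650") // assumes a local broker
+                .build();
+        Consumer<byte[]> consumer = client.newConsumer()
+                .topic("persistent://public/default/logging-function-logs")
+                .subscriptionName("log-reader") // illustrative subscription name
+                .subscribe();
+        while (true) {
+            // Print each log line produced by the function
+            Message<byte[]> msg = consumer.receive();
+            System.out.println(new String(msg.getData()));
+            consumer.acknowledge(msg);
+        }
+    }
+}
+```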
### Java user config
+
+The Java SDK's [`Context`](#java-context) object enables you to access key/value pairs provided to the Pulsar Function via the command line (as JSON). Here's an example function creation command that passes a key/value pair:
+
+```bash
+$ bin/pulsar-admin functions create \
+  # Other function configs
+  --userConfig '{"word-of-the-day":"verdure"}'
+```
+
+To access that value in a Java function:
+
+```java
+import org.apache.pulsar.functions.api.Context;
+import org.apache.pulsar.functions.api.Function;
+import org.slf4j.Logger;
+
+import java.util.Optional;
+
+public class UserConfigFunction implements Function<String, Void> {
+    @Override
+    public Void process(String input, Context context) {
+        Logger LOG = context.getLogger();
+        Optional<String> wotd = context.getUserConfigValue("word-of-the-day");
+        if (wotd.isPresent()) {
+            LOG.info("The word of the day is {}", wotd.get());
+        } else {
+            LOG.warn("No word of the day provided");
+        }
+        return null;
+    }
+}
+```
+
+The `UserConfigFunction` function will log the string `"The word of the day is verdure"` every time the function is invoked (i.e. every time a message arrives). The `word-of-the-day` user config will be changed only when the function is updated with a new config value via the command line.
+
+You can also access the entire user config map or set a default value in case no value is present:
+
+```java
+// Get the whole config map
+Map<String, String> allConfigs = context.getUserConfigMap();
+
+// Get value or resort to default
+String wotd = context.getUserConfigValueOrDefault("word-of-the-day", "perspicacious");
+```
+
+> For all key/value pairs passed to Java Pulsar Functions, both the key *and* the value are `String`s. If you'd like the value to be of a different type, you will need to deserialize it from the `String` type.
+
+### Java metrics
+
+You can record metrics using the [`Context`](#java-context) object on a per-key basis. You can, for example, set a metric for the key `process-count` and a different metric for the key `elevens-count` every time the function processes a message. Here's an example:
+
+```java
+import org.apache.pulsar.functions.api.Context;
+import org.apache.pulsar.functions.api.Function;
+
+public class MetricRecorderFunction implements Function<Integer, Void> {
+    @Override
+    public Void process(Integer input, Context context) {
+        // Records the metric 1 every time a message arrives
+        context.recordMetric("hit-count", 1);
+
+        // Records the metric only if the arriving number equals 11
+        if (input == 11) {
+            context.recordMetric("elevens-count", 1);
+        }
+
+        return null;
+    }
+}
+```
+
+> For instructions on reading and using metrics, see the [Monitoring](deploy-monitoring.md) guide.
+
+
+## Functions for Python
+
+Writing Pulsar Functions in Python entails implementing one of two things:
+
+* A `process` function that takes an input (message data from the function's input topic(s)), applies some kind of logic to it, and either returns an object (to be published to the function's output topic) or `pass`es and thus doesn't produce a message
+* A `Function` class that has a `process` method that provides a message input to process and a [context](#python-context) object
+
+### Getting started
+
+Regardless of which [deployment mode](functions-deploying.md) you're using, you'll need to install the following Python libraries on any machine that's running Pulsar Functions written in Python:
+
+* `pulsar-client`
+* `protobuf`
+* `futures`
+* `grpcio`
+* `grpcio-tools`
+
+That could be your local machine for [local run mode](functions-deploying.md#local-run) or a machine running a Pulsar {% popover broker %} for [cluster mode](functions-deploying.md#cluster-mode).
To install those libraries using pip: + +```bash +$ pip install pulsar-client protobuf futures grpcio grpcio-tools +``` + +### Packaging + +At the moment, the code for Pulsar Functions written in Python must be contained within a single Python file. In the future, Pulsar Functions may support other packaging formats, such as [**P**ython **EX**ecutables](https://github.com/pantsbuild/pex) (PEXes). + +### Python native functions + +If your function doesn't require access to its [context](#context), you can create a Pulsar Function by implementing a `process` function, which provides a single input object that you can process however you wish. Here's an example function that takes a string as its input, adds an exclamation point at the end of the string, and then publishes the resulting string: + +```python +def process(input): + return "{0}!".format(input) +``` + +In general, you should use native functions when you don't need access to the function's [context](#context). If you *do* need access to the function's context, then we recommend using the [Pulsar Functions Python SDK](#python-sdk). + +#### Python native examples + +There is one example Python native function in [this folder](https://github.com/apache/incubator-pulsar/tree/master/pulsar-functions/python-examples): + +* [`native_exclamation_function.py`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/python-examples/native_exclamation_function.py) + +### Python SDK functions + +To get started developing Pulsar Functions using the Python SDK, you'll need to install the [`pulsar-client`](/api/python) library using the instructions [above](#getting-started). + +#### Python SDK examples + +There are several example Python functions in [this folder](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/python-examples): + +Function file | Description +:-------------|:----------- +[`exclamation_function.py`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/python-examples/exclamation_function.py) | Adds an exclamation point at the end of each incoming string +[`logging_function.py`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/python-examples/logging_function.py) | Logs each incoming message +[`thumbnailer.py`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-functions/python-examples/thumbnailer.py) | Takes image data as input and outputs a 128x128 thumbnail of each image + +#### Python context object + +The [`Context`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/context.py) class provides a number of methods that you can use to access the function's [context](#context). 
The various methods for the `Context` class are listed below:
+
+Method | What it provides
+:------|:----------------
+`get_message_id` | The message ID of the message being processed
+`get_topic_name` | The input topic of the message being processed
+`get_function_name` | The name of the current Pulsar Function
+`get_function_id` | The ID of the current Pulsar Function
+`get_instance_id` | The ID of the current Pulsar Functions instance
+`get_function_version` | The version of the current Pulsar Function
+`get_logger` | A logger object that can be used for [logging](#python-logging)
+`get_user_config_value` | Returns the value of a [user-defined config](#python-user-config) (or `None` if the config doesn't exist)
+`get_user_config_map` | Returns the entire user-defined config as a dict
+`record_metric` | Records a per-key [metric](#python-metrics)
+`publish` | Publishes a message to the specified Pulsar topic
+`get_output_serde_class_name` | The name of the output [SerDe](#python-serde) class
+`ack` | {% popover Acks %} the message being processed to Pulsar
+
+### Python SerDe
+
+Pulsar Functions use [SerDe](#serde) when publishing data to and consuming data from Pulsar topics (this is true of both [native](#python-native) functions and [SDK](#python-sdk) functions). You can specify the SerDe when [creating](functions-deploying.md#cluster-mode) or [running](functions-deploying.md#local-run) functions. Here's an example:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --tenant public \
+  --namespace default \
+  --name my_function \
+  --py my_function.py \
+  --className my_function.MyFunction \
+  --customSerdeInputs '{"input-topic-1":"Serde1","input-topic-2":"Serde2"}' \
+  --outputSerdeClassName Serde3 \
+  --output output-topic-1
+```
+
+In this case, there are two input topics, `input-topic-1` and `input-topic-2`, each of which is mapped to a different SerDe class (the map must be specified as a JSON string). The output topic, `output-topic-1`, uses the `Serde3` class for SerDe. At the moment, all Pulsar Function logic, including processing functions and SerDe classes, must be contained within a single Python file.
+
+When using Pulsar Functions for Python, you essentially have three SerDe options:
+
+1. You can use the [`IdentitySerDe`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L70), which leaves the data unchanged. The `IdentitySerDe` is the **default**. Creating or running a function without explicitly specifying SerDe will mean that this option is used.
+2. You can use the [`PickleSerDe`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L62), which uses Python's [`pickle`](https://docs.python.org/3/library/pickle.html) for SerDe.
+3. You can create a custom SerDe class by implementing the baseline [`SerDe`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L50) class, which has just two methods: [`serialize`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L53) for converting the object into bytes, and [`deserialize`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L58) for converting bytes into an object of the required application-specific type.
+
+The table below shows when you should use each SerDe:
+
+SerDe option | When to use
+:------------|:-----------
+`IdentitySerDe` | When you're working with simple types like strings, Booleans, integers, and the like
+`PickleSerDe` | When you're working with complex, application-specific types and are comfortable with `pickle`'s "best effort" approach
+Custom SerDe | When you require explicit control over SerDe, potentially for performance or data compatibility purposes
+
+#### Python SerDe example
+
+Imagine that you're writing Pulsar Functions in Python that are processing tweet objects. Here's a simple `Tweet` class:
+
+```python
+class Tweet(object):
+    def __init__(self, username, tweet_content):
+        self.username = username
+        self.tweet_content = tweet_content
+```
+
+In order to use this class in Pulsar Functions, you'd have two options:
+
+1. You could specify `PickleSerDe`, which would apply the [`pickle`](https://docs.python.org/3/library/pickle.html) library's SerDe
+1. You could create your own SerDe class. Here's a simple example:
+
+   ```python
+   from pulsar import SerDe
+
+   class TweetSerDe(SerDe):
+       def serialize(self, input):
+           return "{0}|{1}".format(input.username, input.tweet_content).encode('utf-8')
+
+       def deserialize(self, input_bytes):
+           tweet_components = input_bytes.decode('utf-8').split('|')
+           return Tweet(tweet_components[0], tweet_components[1])
+   ```
+
+### Python logging
+
+Pulsar Functions that use the [Python SDK](#python-sdk) have access to a logging object that can be used to produce logs at the chosen log level. Here's a simple example function that logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`:
+
+```python
+from pulsar import Function
+
+class LoggingFunction(Function):
+    def process(self, input, context):
+        logger = context.get_logger()
+        msg_id = context.get_message_id()
+        if 'danger' in input:
+            logger.warn("A warning was received in message {0}".format(msg_id))
+        else:
+            logger.info("Message {0} received\nContent: {1}".format(msg_id, input))
+```
+
+If you want your function to produce logs on a Pulsar topic, you need to specify a **log topic** when creating or running the function. Here's an example:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --py logging_function.py \
+  --className logging_function.LoggingFunction \
+  --logTopic logging-function-logs \
+  # Other function configs
+```
+
+Now, all logs produced by the `LoggingFunction` above can be accessed via the `logging-function-logs` topic.
+
+### Python user config
+
+The Python SDK's [`Context`](#python-context) object enables you to access key/value pairs provided to the Pulsar Function via the command line (as JSON). Here's an example function creation command that passes a key/value pair:
+
+```bash
+$ bin/pulsar-admin functions create \
+  # Other function configs \
+  --userConfig '{"word-of-the-day":"verdure"}'
+```
+
+To access that value in a Python function:
+
+```python
+from pulsar import Function
+
+class UserConfigFunction(Function):
+    def process(self, input, context):
+        logger = context.get_logger()
+        wotd = context.get_user_config_value('word-of-the-day')
+        if wotd is None:
+            logger.warn('No word of the day provided')
+        else:
+            logger.info("The word of the day is {0}".format(wotd))
+```
+
+### Python metrics
+
+You can record metrics using the [`Context`](#python-context) object on a per-key basis.
You can, for example, set a metric for the key `process-count` and a different metric for the key `elevens-count` every time the function processes a message. Here's an example:
+
+```python
+from pulsar import Function
+
+class MetricRecorderFunction(Function):
+    def process(self, input, context):
+        context.record_metric('hit-count', 1)
+
+        if input == 11:
+            context.record_metric('elevens-count', 1)
+```
diff --git a/site2/docs/functions-deploying.md b/site2/docs/functions-deploying.md
new file mode 100644
index 0000000000000000000000000000000000000000..82c4228b7c17a6d8d971f27b7beb653f0f7f976d
--- /dev/null
+++ b/site2/docs/functions-deploying.md
@@ -0,0 +1,238 @@
+---
+id: functions-deploying
+title: Deploying and managing Pulsar Functions
+sidebar_label: Deploying functions
+---
+
+At the moment, there are two deployment modes available for Pulsar Functions:
+
+Mode | Description
+:----|:-----------
+Local run mode | The function runs in your local environment, for example on your laptop
+Cluster mode | The function runs *inside of* your Pulsar cluster, on the same machines as your Pulsar brokers
+
+> #### Contributing new deployment modes
+> The Pulsar Functions feature was designed with extensibility in mind, however. Other deployment options will be available in the future. If you'd like to add a new deployment option, we recommend getting in touch with the Pulsar developer community at [dev@pulsar.incubator.apache.org](mailto:dev@pulsar.incubator.apache.org).
+
+## Requirements
+
+In order to deploy and manage Pulsar Functions, you need to have a Pulsar cluster running. There are several options for this:
+
+* You can run a [standalone cluster](getting-started-standalone.md) locally on your own machine
+* You can deploy a Pulsar cluster on [Kubernetes](deploy-kubernetes.md), [Amazon Web Services](deploy-aws.md), [bare metal](deploy-bare-metal.md), [DC/OS](deploy-dcos.md), and more
+
+If you're running a non-{% popover standalone %} cluster, you'll need to obtain the service URL for the cluster. How you obtain the service URL will depend on how you deployed your Pulsar cluster.
+
+## Command-line interface
+
+Pulsar Functions are deployed and managed using the [`pulsar-admin functions`](reference-pulsar-admin.md#functions) interface, which contains commands such as [`create`](reference-pulsar-admin.md#functions-create) for deploying functions in [cluster mode](#cluster-mode), [`trigger`](reference-pulsar-admin.md#functions-trigger) for [triggering](#triggering) functions, [`list`](reference-pulsar-admin.md#functions-list) for listing deployed functions, and several others.
+
+### Fully Qualified Function Name (FQFN)
+
+Each Pulsar Function has a **Fully Qualified Function Name** (FQFN) that consists of three elements: the function's tenant, namespace, and function name. FQFNs look like this:
+
+```http
+tenant/namespace/name
+```
+
+FQFNs enable you to, for example, create multiple functions with the same name provided that they're in different namespaces.
+
+### Default arguments
+
+When managing Pulsar Functions, you'll need to specify a variety of information about those functions, including tenant, namespace, input and output topics, etc. There are some parameters, however, that have default values that will be supplied if omitted. The table below lists the defaults:
+
+Parameter | Default
+:---------|:-------
+Function name | Whichever value is specified for the class name (minus org, library, etc.). The flag `--className org.example.MyFunction`, for example, would give the function a name of `MyFunction`.
+Tenant | Derived from the input topics' names. If the input topics are under the `marketing` tenant---i.e. the topic names have the form `persistent://marketing/{namespace}/{topicName}`---then the tenant will be `marketing`.
+Namespace | Derived from the input topics' names. If the input topics are under the `asia` namespace under the `marketing` tenant---i.e. the topic names have the form `persistent://marketing/asia/{topicName}`---then the namespace will be `asia`.
+Output topic | `{input topic}-{function name}-output`. A function with an input topic name of `incoming` and a function name of `exclamation`, for example, would have an output topic of `incoming-exclamation-output`.
+Subscription type | For at-least-once and at-most-once [processing guarantees](functions-gaurantees.md), the [`SHARED`](getting-started-concepts-and-architecture.md#shared) subscription type is applied by default; for effectively-once guarantees, [`FAILOVER`](getting-started-concepts-and-architecture.md#failover) is applied
+Processing guarantees | [`ATLEAST_ONCE`](functions-gaurantees.md)
+Pulsar service URL | `pulsar://localhost:6650`
+
+#### Example use of defaults
+
+Take this `create` command:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --jar my-pulsar-functions.jar \
+  --className org.example.MyFunction \
+  --inputs my-function-input-topic1,my-function-input-topic2
+```
+
+The created function would have default values supplied for the function name (`MyFunction`), tenant (`public`), namespace (`default`), subscription type (`SHARED`), processing guarantees (`ATLEAST_ONCE`), and Pulsar service URL (`pulsar://localhost:6650`).
+
+## Local run mode
+
+If you run a Pulsar Function in **local run** mode, it will run on the machine from which the command is run (this could be your laptop, an [AWS EC2](https://aws.amazon.com/ec2/) instance, etc.). Here's an example [`localrun`](reference-pulsar-admin.md#functions-localrun) command:
+
+```bash
+$ bin/pulsar-admin functions localrun \
+  --py myfunc.py \
+  --className myfunc.SomeFunction \
+  --inputs persistent://public/default/input-1 \
+  --output persistent://public/default/output-1
+```
+
+By default, the function will connect to a Pulsar cluster running on the same machine, via a local {% popover broker %} service URL of `pulsar://localhost:6650`. If you'd like to use local run mode to run a function but connect it to a non-local Pulsar cluster, you can specify a different broker URL using the `--brokerServiceUrl` flag. Here's an example:
+
+```bash
+$ bin/pulsar-admin functions localrun \
+  --brokerServiceUrl pulsar://my-cluster-host:6650 \
+  # Other function parameters
+```
+
+## Cluster mode
+
+When you run a Pulsar Function in **cluster mode**, the function code will be uploaded to a Pulsar broker and run *alongside the broker* rather than in your [local environment](#local-run). You can run a function in cluster mode using the [`create`](reference-pulsar-admin.md#functions-create) command. Here's an example:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --py myfunc.py \
+  --className myfunc.SomeFunction \
+  --inputs persistent://public/default/input-1 \
+  --output persistent://public/default/output-1
+```
+
+### Updating cluster mode functions
+
+You can use the [`update`](reference-pulsar-admin.md#functions-update) command to update a Pulsar Function running in cluster mode.
This command, for example, would update the function created in the section [above](#cluster-mode):
+
+```bash
+$ bin/pulsar-admin functions update \
+  --py myfunc.py \
+  --className myfunc.SomeFunction \
+  --inputs persistent://public/default/new-input-topic \
+  --output persistent://public/default/new-output-topic
+```
+
+### Parallelism
+
+Pulsar Functions run as processes called **instances**. When you run a Pulsar Function, it runs as a single instance by default (and in [local run mode](#local-run) you can *only* run a single instance of a function).
+
+You can also specify the *parallelism* of a function, i.e. the number of instances to run, when you create the function. You can set the parallelism factor using the `--parallelism` flag of the [`create`](reference-pulsar-admin.md#functions-create) command. Here's an example:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --parallelism 3 \
+  # Other function info
+```
+
+You can adjust the parallelism of an already created function using the [`update`](reference-pulsar-admin.md#functions-update) interface.
+
+```bash
+$ bin/pulsar-admin functions update \
+  --parallelism 5 \
+  # Other function configs
+```
+
+If you're specifying a function's configuration via YAML, use the `parallelism` parameter. Here's an example config file:
+
+```yaml
+# function-config.yaml
+parallelism: 3
+inputs:
+- persistent://public/default/input-1
+output: persistent://public/default/output-1
+# other parameters
+```
+
+And here's the corresponding update command:
+
+```bash
+$ bin/pulsar-admin functions update \
+  --functionConfigFile function-config.yaml
+```
+
+### Function instance resources
+
+When you run Pulsar Functions in [cluster mode](#cluster-mode), you can specify the resources that are assigned to each function [instance](#parallelism):
+
+Resource | Specified as... | Runtimes
+:--------|:----------------|:--------
+CPU | The number of cores | Docker (coming soon)
+RAM | The number of bytes | Process, Docker
+Disk space | The number of bytes | Docker
+
+Here's an example function creation command that allocates 8 cores, 8 GB of RAM, and 10 GB of disk space to a function:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --jar target/my-functions.jar \
+  --className org.example.functions.MyFunction \
+  --cpu 8 \
+  --ram 8589934592 \
+  --disk 10737418240
+```
+
+> #### Resources are *per instance*
+> The resources that you apply to a given Pulsar Function are applied to each [instance](#parallelism) of the function. If you apply 8 GB of RAM to a function with a parallelism of 5, for example, then you are applying 40 GB of RAM total for the function. You should always make sure to factor parallelism---i.e. the number of instances---into your resource calculations.
+
+## Triggering Pulsar Functions
+
+If a Pulsar Function is running in [cluster mode](#cluster-mode), you can **trigger** it at any time using the command line. Triggering a function means that you send a message with a specific value to the function and get the function's output (if any) via the command line.
+
+> Triggering a function is ultimately no different from invoking a function by producing a message on one of the function's input topics. The [`pulsar-admin functions trigger`](reference-pulsar-admin.md#functions-trigger) command is essentially a convenient mechanism for sending messages to functions without needing to use the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool or a language-specific client library.
To show an example of function triggering, let's start with a simple [Python function](functions-api.md#python) that returns a simple string based on the input:
+
+```python
+# myfunc.py
+def process(input):
+    return "This function has been triggered with a value of {0}".format(input)
+```
+
+Since triggering requires a function that's running in your cluster, let's deploy that function in [cluster mode](#cluster-mode):
+
+```bash
+$ bin/pulsar-admin functions create \
+  --tenant public \
+  --namespace default \
+  --name myfunc \
+  --py myfunc.py \
+  --className myfunc \
+  --inputs persistent://public/default/in \
+  --output persistent://public/default/out
+```
+
+Now let's make a consumer listen on the output topic for messages coming from the `myfunc` function using the [`pulsar-client consume`](reference-cli-tools.md#pulsar-client-consume) command:
+
+```bash
+$ bin/pulsar-client consume persistent://public/default/out \
+  --subscription-name my-subscription \
+  --num-messages 0 # Listen indefinitely
+```
+
+Now let's trigger that function:
+
+```bash
+$ bin/pulsar-admin functions trigger \
+  --tenant public \
+  --namespace default \
+  --name myfunc \
+  --triggerValue "hello world"
+```
+
+The consumer listening on the output topic should then produce this in its logs:
+
+```
+----- got message -----
+This function has been triggered with a value of hello world
+```
+
+> #### Topic info not required
+> In the `trigger` command above, you may have noticed that you only need to specify basic information about the function (tenant, namespace, and name). To trigger the function, you didn't need to know the function's input topic(s).
+
+
diff --git a/site2/docs/functions-gaurantees.md b/site2/docs/functions-gaurantees.md
new file mode 100644
index 0000000000000000000000000000000000000000..a32efb10b4b59ee0eb2c36e538a6151fc252a1a4
--- /dev/null
+++ b/site2/docs/functions-gaurantees.md
@@ -0,0 +1,41 @@
+---
+id: functions-guarantees
+title: Processing guarantees
+sidebar_label: Processing guarantees
+---
+
+Pulsar Functions provides three different messaging semantics that you can apply to any function:
+
+Delivery semantics | Description
+:------------------|:-------
+**At-most-once** delivery | Each message sent to the function is processed once or not at all (hence the "at most")
+**At-least-once** delivery | Each message sent to the function is processed one or more times (hence the "at least")
+**Effectively-once** delivery | Each message sent to the function will have one output associated with it
+
+## Applying processing guarantees to a function
+
+You can set the processing guarantees for a Pulsar Function when you create the function. This [`pulsar-admin functions create`](reference-pulsar-admin.md#pulsar-admin-functions-create) command, for example, would apply effectively-once guarantees to the function:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --processingGuarantees EFFECTIVELY_ONCE \
+  # Other function configs
+```
+
+The available options are:
+
+* `ATMOST_ONCE`
+* `ATLEAST_ONCE`
+* `EFFECTIVELY_ONCE`
+
+> By default, Pulsar Functions provide at-least-once delivery guarantees. So if you create a function without supplying a value for the `--processingGuarantees` flag, then the function will provide at-least-once guarantees.
+
+## Updating the processing guarantees of a function
+
+You can change the processing guarantees applied to a function once it's already been created using the [`update`](reference-pulsar-admin.md#pulsar-admin-functions-update) command.
Here's an example:
+
+```bash
+$ bin/pulsar-admin functions update \
+  --processingGuarantees ATMOST_ONCE \
+  # Other function configs
+```
diff --git a/site2/docs/functions-metrics.md b/site2/docs/functions-metrics.md
new file mode 100644
index 0000000000000000000000000000000000000000..c445bd39707c549a02392cd9fcdf1d38ae37d290
--- /dev/null
+++ b/site2/docs/functions-metrics.md
@@ -0,0 +1,43 @@
+---
+id: functions-metrics
+title: Metrics for Pulsar Functions
+sidebar_label: Metrics
+---
+
+Pulsar Functions can publish arbitrary metrics to the metrics interface, which can then be queried. This doc contains instructions for publishing metrics using the [Java](#java-sdk) and [Python](#python-sdk) Pulsar Functions SDKs.
+
+> #### Metrics and stats not available through language-native interfaces
+> If a Pulsar Function uses the language-native interface for [Java](functions-api.md#java-native) or [Python](functions-api.md#python-native), that function will not be able to publish metrics and stats to Pulsar.
+
+## Accessing metrics
+
+For a guide to accessing metrics created by Pulsar Functions, see the guide to [Monitoring](deploy-monitoring.md) in Pulsar.
+
+## Java SDK
+
+If you're creating a Pulsar Function using the [Java SDK](functions-api.md#java-sdk), the {% javadoc Context client org.apache.pulsar.functions.api.Context %} object has a `recordMetric` method that you can use to register both a name for the metric and a value. Here's the signature for that method:
+
+```java
+void recordMetric(String metricName, double value);
+```
+
+Here's an example function:
+
+```java
+import org.apache.pulsar.functions.api.Context;
+import org.apache.pulsar.functions.api.Function;
+
+public class MetricRecordingFunction implements Function<String, Void> {
+    @Override
+    public Void process(String input, Context context) {
+        context.recordMetric("number-of-characters", input.length());
+        return null;
+    }
+}
+```
+
+This function counts the length of each incoming message (of type `String`) and then registers that under the `number-of-characters` metric.
+
+## Python SDK
+
+Documentation for the [Python SDK](functions-api.md#python-sdk) is coming soon.
diff --git a/site2/docs/functions-overview.md b/site2/docs/functions-overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..e0ea6d61a77d1ad21b7a036a290e52bf5f94962a
--- /dev/null
+++ b/site2/docs/functions-overview.md
@@ -0,0 +1,451 @@
+---
+id: functions-overview
+title: Pulsar Functions overview
+sidebar_label: Overview
+---
+
+**Pulsar Functions** are lightweight compute processes that
+
+* consume messages from one or more Pulsar topics,
+* apply a user-supplied processing logic to each message,
+* publish the results of the computation to another topic
+
+Here's an example Pulsar Function for Java (using the [native interface](functions-api.md#java-native)):
+
+```java
+import java.util.function.Function;
+
+public class ExclamationFunction implements Function<String, String> {
+    @Override
+    public String apply(String input) { return String.format("%s!", input); }
+}
+```
+
+Here's an equivalent function in Python (also using the [native interface](functions-api.md#python-native)):
+
+```python
+def process(input):
+    return "{0}!".format(input)
+```
+
+Functions are executed each time a message is published to the input topic. If a function is listening on the topic `tweet-stream`, for example, then the function would be run each time a message is published to that topic.
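+
+To see that in action, you could drive the function by producing to its input topic with the Pulsar Java client; each message results in one invocation. Here's a minimal sketch (the `tweet-stream` topic and local service URL are illustrative assumptions):
+
+```java
+import org.apache.pulsar.client.api.Producer;
+import org.apache.pulsar.client.api.PulsarClient;
+
+public class FunctionDriver {
+    public static void main(String[] args) throws Exception {
+        PulsarClient client = PulsarClient.builder()
+                .serviceUrl("pulsar://localhost:6650") // assumes a local broker
+                .build();
+        Producer<byte[]> producer = client.newProducer()
+                .topic("tweet-stream") // a function subscribed here runs once per message
+                .create();
+        for (int i = 0; i < 3; i++) {
+            // Each send call triggers one invocation of the function
+            producer.send(("message-" + i).getBytes());
+        }
+        client.close();
+    }
+}
+```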
+ +## Goals + +The core goal behind Pulsar Functions is to enable you to easily create processing logic of any level of complexity without needing to deploy a separate neighboring system (such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://apache.github.io/incubator-heron), [Apache Flink](https://flink.apache.org/), etc.). Pulsar Functions is essentially ready-made compute infrastructure at your disposal as part of your Pulsar messaging system. This core goal is tied to a series of other goals: + +* Developer productivity ([language-native](#native) vs. [Pulsar Functions SDK](#sdk) functions) +* Easy troubleshooting +* Operational simplicity (no need for an external processing system) + +## Inspirations + +The Pulsar Functions feature was inspired by (and takes cues from) several systems and paradigms: + +* Stream processing engines such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://apache.github.io/incubator-heron), and [Apache Flink](https://flink.apache.org) +* "Serverless" and "Function as a Service" (FaaS) cloud platforms like [Amazon Web Services Lambda](https://aws.amazon.com/lambda/), [Google Cloud Functions](https://cloud.google.com/functions/), and [Azure Cloud Functions](https://azure.microsoft.com/en-us/services/functions/) + +Pulsar Functions could be described as + +* [Lambda](https://aws.amazon.com/lambda/)-style functions that are +* specifically designed to use Pulsar as a message bus + +## Programming model + +The core programming model behind Pulsar Functions is very simple: + +* Functions receive messages from one or more **input {% popover topics %}**. Every time a message is received, the function can do a variety of things: + * Apply some processing logic to the input and write output to: + * An **output topic** in Pulsar + * [Apache BookKeeper](#state-storage) + * Write logs to a **log topic** (potentially for debugging purposes) + * Increment a [counter](#counters) + +![Pulsar Functions core programming model](/docs/assets/pulsar-functions-overview.png) + +### Word count example + +If you were to implement the classic word count example using Pulsar Functions, it might look something like this: + +![Pulsar Functions word count example](/docs/assets/pulsar-functions-word-count.png) + +If you were writing the function in [Java](functions-api.md#java) using the [Pulsar Functions SDK for Java](functions-api.md#java-sdk), you could write the function like this... 
+
+```java
+package org.example.functions;
+
+import org.apache.pulsar.functions.api.Context;
+import org.apache.pulsar.functions.api.Function;
+
+import java.util.Arrays;
+
+public class WordCountFunction implements Function<String, Void> {
+    // This function is invoked every time a message is published to the input topic
+    @Override
+    public Void process(String input, Context context) {
+        Arrays.asList(input.split(" ")).forEach(word -> {
+            String counterKey = word.toLowerCase();
+            context.incrCounter(counterKey, 1);
+        });
+        return null;
+    }
+}
+```
+
+...and then [deploy it](#cluster-mode) in your Pulsar cluster using the [command line](#cli) like this:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --jar target/my-jar-with-dependencies.jar \
+  --className org.example.functions.WordCountFunction \
+  --tenant public \
+  --namespace default \
+  --name word-count \
+  --inputs persistent://public/default/sentences \
+  --output persistent://public/default/count
+```
+
+### Content-based routing example
+
+The use cases for Pulsar Functions are essentially endless, but let's dig into a more sophisticated example that involves content-based routing.
+
+Imagine a function that takes items (strings) as input and publishes them to either a fruits or vegetables topic, depending on the item. Or, if an item is neither a fruit nor a vegetable, a warning is logged to a [log topic](#logging). Here's a visual representation:
+
+![Pulsar Functions routing example](/docs/assets/pulsar-functions-routing-example.png)
+
+If you were implementing this routing functionality in Python, it might look something like this:
+
+```python
+from pulsar import Function
+
+class RoutingFunction(Function):
+    def __init__(self):
+        self.fruits_topic = "persistent://public/default/fruits"
+        self.vegetables_topic = "persistent://public/default/vegetables"
+
+    def is_fruit(self, item):
+        return item in ["apple", "orange", "pear", "other fruits..."]
+
+    def is_vegetable(self, item):
+        return item in ["carrot", "lettuce", "radish", "other vegetables..."]
+
+    def process(self, item, context):
+        if self.is_fruit(item):
+            context.publish(self.fruits_topic, item)
+        elif self.is_vegetable(item):
+            context.publish(self.vegetables_topic, item)
+        else:
+            warning = "The item {0} is neither a fruit nor a vegetable".format(item)
+            context.get_logger().warn(warning)
+```
+
+## Command-line interface
+
+Pulsar Functions are managed using the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool (in particular the [`functions`](reference-pulsar-admin.md#pulsar-admin-functions) command). Here's an example command that would run a function in [local run mode](#local-run):
+
+```bash
+$ bin/pulsar-admin functions localrun \
+  --inputs persistent://public/default/test_src \
+  --output persistent://public/default/test_result \
+  --jar examples/api-examples.jar \
+  --className org.apache.pulsar.functions.api.examples.ExclamationFunction
+```
+
+## Fully Qualified Function Name (FQFN)
+
+Each Pulsar Function has a **Fully Qualified Function Name** (FQFN) that consists of three elements: the function's tenant, namespace, and function name. FQFNs look like this:
+
+```http
+tenant/namespace/name
+```
+
+FQFNs enable you to, for example, create multiple functions with the same name provided that they're in different namespaces.
+
+## Configuration
+
+Pulsar Functions can be configured in two ways:
+
+* Via [command-line arguments](#cli) passed to the [`pulsar-admin functions`](reference-pulsar-admin.md#pulsar-admin-functions) interface
+* Via [YAML](http://yaml.org/) configuration files
+
+If you're supplying a YAML configuration, you must specify a path to the file on the command line. Here's an example:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --functionConfigFile ./my-function.yaml
+```
+
+And here's an example `my-function.yaml` file:
+
+```yaml
+name: my-function
+tenant: public
+namespace: default
+jar: ./target/my-functions.jar
+className: org.example.pulsar.functions.MyFunction
+inputs:
+- persistent://public/default/test_src
+output: persistent://public/default/test_result
+```
+
+You can also mix and match configuration methods by specifying some function attributes via the CLI and others via YAML configuration.
+
+## Supported languages
+
+Pulsar Functions can currently be written in [Java](functions-api.md#java) and [Python](functions-api.md#python). Support for additional languages is coming soon.
+
+## The Pulsar Functions API
+
+The Pulsar Functions API enables you to create processing logic that is:
+
+* Type safe. Pulsar Functions can process raw bytes or more complex, application-specific types.
+* Based on SerDe (**Ser**ialization/**De**serialization). A variety of types are supported "out of the box" but you can also create your own custom SerDe logic.
+
+### Function context
+
+Each Pulsar Function created using the [Pulsar Functions SDK](#sdk) has access to a context object that provides both:
+
+1. A wide variety of information about the function, including:
+   * The name of the function
+   * The tenant and namespace of the function
+   * [User-supplied configuration](#user-config) values
+2. Special functionality, including:
+   * The ability to produce [logs](#logging) to a specified logging topic
+   * The ability to produce [metrics](#metrics)
+
+### Language-native functions
+
+Both Java and Python support writing "native" functions, i.e. Pulsar Functions with no dependencies.
+
+The benefit of native functions is that they don't have any dependencies beyond what's already available in Java/Python "out of the box." The downside is that they don't provide access to the function's [context](#context), which is necessary for a variety of functionality, including [logging](#logging), [user configuration](#user-config), and more.
+
+## The Pulsar Functions SDK
+
+If you'd like a Pulsar Function to have access to a [context object](#context), you can use the **Pulsar Functions SDK**, available for both [Java](functions-api.md#java-sdk) and [Python](functions-api.md#python-sdk).
+
+### Java
+
+Here's an example Java function that uses information about its context:
+
+```java
+import org.apache.pulsar.functions.api.Context;
+import org.apache.pulsar.functions.api.Function;
+import org.slf4j.Logger;
+
+public class ContextAwareFunction implements Function<String, Void> {
+    @Override
+    public Void process(String input, Context context) {
+        Logger LOG = context.getLogger();
+        String functionTenant = context.getTenant();
+        String functionNamespace = context.getNamespace();
+        String functionName = context.getFunctionName();
+        LOG.info("Function tenant/namespace/name: {}/{}/{}", functionTenant, functionNamespace, functionName);
+        return null;
+    }
+}
+```
+
+### Python
+
+Here's an example Python function that uses information about its context:
+
+```python
+from pulsar import Function
+
+class ContextAwareFunction(Function):
+    def process(self, input, context):
+        log = context.get_logger()
+        function_tenant = context.get_function_tenant()
+        function_namespace = context.get_function_namespace()
+        function_name = context.get_function_name()
+        log.info("Function tenant/namespace/name: {0}/{1}/{2}".format(function_tenant, function_namespace, function_name))
+```
+
+## Deployment
+
+The Pulsar Functions feature was built to support a variety of deployment options. At the moment, there are two ways to run Pulsar Functions:
+
+Deployment mode | Description
+:---------------|:-----------
+[Local run mode](#local-run) | The function runs in your local environment, for example on your laptop
+[Cluster mode](#cluster-run) | The function runs *inside of* your Pulsar cluster, on the same machines as your Pulsar {% popover brokers %}
+
+### Local run mode
+
+If you run a Pulsar Function in **local run** mode, it will run on the machine from which the command is run (this could be your laptop, an [AWS EC2](https://aws.amazon.com/ec2/) instance, etc.). Here's an example [`localrun`](reference-pulsar-admin.md#pulsar-admin-functions-localrun) command:
+
+```bash
+$ bin/pulsar-admin functions localrun \
+  --py myfunc.py \
+  --className myfunc.SomeFunction \
+  --inputs persistent://public/default/input-1 \
+  --output persistent://public/default/output-1
+```
+
+By default, the function will connect to a Pulsar cluster running on the same machine, via a local broker service URL of `pulsar://localhost:6650`. If you'd like to use local run mode to run a function but connect it to a non-local Pulsar cluster, you can specify a different broker URL using the `--brokerServiceUrl` flag. Here's an example:
+
+```bash
+$ bin/pulsar-admin functions localrun \
+  --brokerServiceUrl pulsar://my-cluster-host:6650 \
+  # Other function parameters
+```
+
+### Cluster run mode
+
+When you run a Pulsar Function in **cluster mode**, the function code will be uploaded to a Pulsar broker and run *alongside the broker* rather than in your [local environment](#local-run). You can run a function in cluster mode using the [`create`](reference-pulsar-admin.md#pulsar-admin-functions-create) command. Here's an example:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --py myfunc.py \
+  --className myfunc.SomeFunction \
+  --inputs persistent://public/default/input-1 \
+  --output persistent://public/default/output-1
+```
+
+This command will upload `myfunc.py` to Pulsar, which will use the code to start one [or more](#parallelism) instances of the function.
+
+### Parallelism
+
+By default, only one **instance** of a Pulsar Function runs when you create and run it in [cluster run mode](#cluster-run).
You can also, however, run multiple instances in parallel. You can specify the number of instances when you create the function, or update an existing single-instance function with a new parallelism factor.
+
+This command, for example, would create and run a function with a parallelism of 5 (i.e. 5 instances):
+
+```bash
+$ bin/pulsar-admin functions create \
+  --name parallel-fun \
+  --tenant public \
+  --namespace default \
+  --py func.py \
+  --className func.ParallelFunction \
+  --parallelism 5
+```
+
+### Function instance resources
+
+When you run Pulsar Functions in [cluster run mode](#cluster-run), you can specify the resources that are assigned to each function [instance](#parallelism):
+
+Resource | Specified as... | Runtimes
+:--------|:----------------|:--------
+CPU | The number of cores | Docker (coming soon)
+RAM | The number of bytes | Process, Docker
+Disk space | The number of bytes | Docker
+
+Here's an example function creation command that allocates 8 cores, 8 GB of RAM, and 10 GB of disk space to a function:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --jar target/my-functions.jar \
+  --className org.example.functions.MyFunction \
+  --cpu 8 \
+  --ram 8589934592 \
+  --disk 10737418240
+```
+
+For more information on resources, see the [Deploying and Managing Pulsar Functions](functions-deploying.md#resources) documentation.
+
+### Logging
+
+Pulsar Functions created using the [Pulsar Functions SDK](#sdk) can send logs to a log topic that you specify as part of the function's configuration. The function created using the command below, for example, would produce all logs on the `persistent://public/default/my-func-1-log` topic:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --name my-func-1 \
+  --logTopic persistent://public/default/my-func-1-log \
+  # Other configs
+```
+
+Here's an example [Java function](functions-api.md#java-logging) that logs at different log levels based on the function's input:
+
+```java
+import org.apache.pulsar.functions.api.Context;
+import org.apache.pulsar.functions.api.Function;
+import org.slf4j.Logger;
+
+public class LoggerFunction implements Function<String, Void> {
+    @Override
+    public Void process(String input, Context context) {
+        Logger LOG = context.getLogger();
+        if (input.length() <= 100) {
+            LOG.info("This string has a length of {}", input.length());
+        } else {
+            LOG.warn("This string is getting too long! It has {} characters", input.length());
+        }
+        return null;
+    }
+}
+```
+
+### User configuration
+
+Pulsar Functions can be passed arbitrary key-values via the command line (both keys and values must be strings). This set of key-values is called the function's **user configuration**. User configurations must consist of JSON strings.
+
+Here's an example of passing a user configuration to a function:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --userConfig '{"key-1":"value-1","key-2":"value-2"}' \
+  # Other configs
+```
+
+Here's an example of a function that accesses that config map:
+
+```java
+import org.apache.pulsar.functions.api.Context;
+import org.apache.pulsar.functions.api.Function;
+
+public class ConfigMapFunction implements Function<String, Void> {
+    @Override
+    public Void process(String input, Context context) {
+        String val1 = context.getUserConfigValue("key-1").get();
+        String val2 = context.getUserConfigValue("key-2").get();
+        context.getLogger().info("The user-supplied values are {} and {}", val1, val2);
+        return null;
+    }
+}
+```
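+
+Note that `getUserConfigValue` returns an `Optional`, so calling `get()` as above will throw if a key wasn't supplied on the command line. When a key might be absent, a safer sketch uses the `getUserConfigValueOrDefault` method from the `Context` interface (the key and fallback value here are illustrative):
+
+```java
+import org.apache.pulsar.functions.api.Context;
+import org.apache.pulsar.functions.api.Function;
+
+public class SafeConfigMapFunction implements Function<String, Void> {
+    @Override
+    public Void process(String input, Context context) {
+        // Falls back to a default value rather than calling Optional.get() on a missing key
+        String val1 = context.getUserConfigValueOrDefault("key-1", "fallback-1");
+        context.getLogger().info("The value of key-1 is {}", val1);
+        return null;
+    }
+}
+```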
+
+### Triggering Pulsar Functions
+
+Pulsar Functions running in [cluster mode](#cluster-mode) can be [triggered](functions-deploying.md#triggering) via the [command line](#cli). With triggering you can easily pass a specific value to a function and get the function's return value *without* needing to worry about creating a client, sending a message to the right input topic, etc. Triggering can be very useful for---but is by no means limited to---testing and debugging purposes.
+
+> Triggering a function is ultimately no different from invoking a function by producing a message on one of the function's input topics. The [`pulsar-admin functions trigger`](reference-pulsar-admin.md#pulsar-admin-functions-trigger) command is essentially a convenient mechanism for sending messages to functions without needing to use the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool or a language-specific client library.
+
+Let's take an example Pulsar Function written in Python (using the [native interface](functions-api.md#python-native)) that simply reverses string inputs:
+
+```python
+def process(input):
+    return input[::-1]
+```
+
+If that function were running in a Pulsar cluster, it could be triggered like this:
+
+```bash
+$ bin/pulsar-admin functions trigger \
+  --tenant public \
+  --namespace default \
+  --name reverse-func \
+  --triggerValue "snoitcnuf raslup ot emoclew"
+```
+
+That should return `welcome to pulsar functions` as the console output.
+
+> Instead of passing in a string via the CLI, you can also trigger a Pulsar Function with the contents of a file using the `--triggerFile` flag.
+
+## Processing guarantees
+
+The Pulsar Functions feature provides three different messaging semantics that you can apply to any function:
+
+Delivery semantics | Description
+:------------------|:-------
+**At-most-once** delivery | Each message sent to the function is processed once or not at all (hence the "at most")
+**At-least-once** delivery | Each message sent to the function is processed one or more times (hence the "at least")
+**Effectively-once** delivery | Each message sent to the function will have one output associated with it
+
+This command, for example, would run a function in [cluster mode](#cluster-mode) with effectively-once guarantees applied:
+
+```bash
+$ bin/pulsar-admin functions create \
+  --name my-effectively-once-function \
+  --processingGuarantees EFFECTIVELY_ONCE \
+  # Other function configs
+```
+
+## Metrics
+
+Pulsar Functions that use the [Pulsar Functions SDK](#sdk) can publish metrics to Pulsar. For more information, see [Metrics for Pulsar Functions](functions-metrics.md).
+
+## State storage
+
+Pulsar Functions use [Apache BookKeeper](https://bookkeeper.apache.org) as a state storage interface. All Pulsar installations, including local standalone installations, include a deployment of BookKeeper bookies.
diff --git a/site2/docs/functions-quickstart.md b/site2/docs/functions-quickstart.md
new file mode 100644
index 0000000000000000000000000000000000000000..e8a9c4ddb0c6617794c326decf3a914638482d83
--- /dev/null
+++ b/site2/docs/functions-quickstart.md
@@ -0,0 +1,263 @@
+---
+id: functions-quickstart
+title: Getting started with Pulsar Functions
+sidebar_label: Getting started
+---
+
+This tutorial will walk you through running a {% popover standalone %} Pulsar {% popover cluster %} on your machine and then running your first Pulsar Functions using that cluster. The first function will run in local run mode (outside your Pulsar {% popover cluster %}), while the second will run in cluster mode (inside your cluster).
+ +> In local run mode, your Pulsar Function will communicate with your Pulsar cluster but will run outside of the cluster. + +## Prerequisites + +In order to follow along with this tutorial, you'll need to have [Maven](https://maven.apache.org/download.cgi) installed on your machine. + +## Run a standalone Pulsar cluster + +In order to run our Pulsar Functions, we'll need to run a Pulsar cluster locally first. The easiest way to do that is to run Pulsar in {% popover standalone %} mode. Follow these steps to start up a standalone cluster: + +```bash +$ wget https://repository.apache.org/content/repositories/snapshots/org/apache/pulsar/distribution/2.0.0-incubating-SNAPSHOT/distribution-2.0.0-incubating-{{ site.preview_version_id }}-bin.tar.gz +$ tar xvf distribution-2.0.0-incubating-{{ site.preview_version_id }}-bin.tar.gz +$ cd apache-pulsar-2.0.0-incubating-SNAPSHOT +$ bin/pulsar standalone \ + --advertised-address 127.0.0.1 +``` + +When running Pulsar in standalone mode, the `public` tenant and `default` namespace will be created automatically for you. That tenant and namespace will be used throughout this tutorial. + +## Run a Pulsar Function in local run mode + +Let's start with a simple function that takes a string as input from a Pulsar topic, adds an exclamation point to the end of the string, and then publishes that new string to another Pulsar topic. Here's the code for the function: + +```java +package org.apache.pulsar.functions.api.examples; + +import java.util.function.Function; + +public class ExclamationFunction implements Function { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} +``` + +A JAR file containing this and several other functions (written in Java) is included with the binary distribution you downloaded above (in the `examples` folder). To run the function in local mode, i.e. on our laptop but outside our Pulsar cluster: + +```bash +$ bin/pulsar-admin functions localrun \ + --jar examples/api-examples.jar \ + --className org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --inputs persistent://public/default/exclamation-input \ + --output persistent://public/default/exclamation-output \ + --name exclamation +``` + +> #### Multiple input topics allowed +> In the example above, a single topic was specified using the `--inputs` flag. You can also specify multiple input topics as a comma-separated list using the same flag. Here's an example: +> ```bash +> --inputs topic1,topic2 +> ``` + +We can open up another shell and use the [`pulsar-client`](reference-pulsar-admin.md#pulsar-client) tool to listen for messages on the output topic: + +```bash +$ bin/pulsar-client consume persistent://public/default/exclamation-output \ + --subscription-name my-subscription \ + --num-messages 0 +``` + +> Setting the `--num-messages` flag to 0 means that the consumer will listen on the topic indefinitely (rather than only accepting a certain number of messages). + +With a listener up and running, we can open up another shell and produce a message on the input topic that we specified: + +```bash +$ bin/pulsar-client produce persistent://public/default/exclamation-input \ + --num-produce 1 \ + --messages "Hello world" +``` + +In the output, you should see the following: + +``` +----- got message ----- +Hello world! +``` + +Success! As you can see, the message has been successfully processed by the exclamation function. To shut down the function, simply hit **Ctrl+C**. 
+ +Here's what happened: + +* The `Hello world` message that we published to the input topic (`persistent://public/default/exclamation-input`) was passed to the exclamation function that we ran on our machine +* The exclamation function processed the message (providing a result of `Hello world!`) and published the result to the output topic (`persistent://public/default/exclamation-output`). +* If our exclamation function *hadn't* been running, Pulsar would have durably stored the message data published to the input topic in [Apache BookKeeper](https://bookkeeper.apache.org) until a consumer consumed and acknowledged the message + +## Run a Pulsar Function in cluster mode + +[Local run mode](#local-run-mode) is useful for development and experimentation, but if you want to use Pulsar Functions in a real Pulsar deployment, you'll want to run them in **cluster mode**. In this mode, Pulsar Functions run *inside* your Pulsar cluster and are managed using the same [`pulsar-admin functions`](reference-pulsar-admin.md#pulsar-admin-functions) interface that we've been using thus far. + +This command, for example, would deploy the same exclamation function we ran locally above *in our Pulsar cluster* (rather than outside it): + +```bash +$ bin/pulsar-admin functions create \ + --jar examples/api-examples.jar \ + --className org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --inputs persistent://public/default/exclamation-input \ + --output persistent://public/default/exclamation-output \ + --name exclamation +``` + +You should see `Created successfully` in the output. Now, let's see a list of functions running in our cluster: + +```bash +$ bin/pulsar-admin functions list \ + --tenant public \ + --namespace default +``` + +We should see just the `exclamation` function listed there. We can also check the status of our deployed function using the `getstatus` command: + +```bash +$ bin/pulsar-admin functions getstatus \ + --tenant public \ + --namespace default \ + --name exclamation +``` + +You should see this JSON output: + +```json +{ + "functionStatusList": [ + { + "running": true, + "instanceId": "0" + } + ] +} +``` + +As we can see, (a) the instance is currently running and (b) there is one instance, with an ID of 0, running. We can get other information about the function (topics, tenant, namespace, etc.) using the `get` command instead of `getstatus`: + +```bash +$ bin/pulsar-admin functions get \ + --tenant public \ + --namespace default \ + --name exclamation +``` + +You should see this JSON output: + +```json +{ + "tenant": "public", + "namespace": "default", + "name": "exclamation", + "className": "org.apache.pulsar.functions.api.examples.ExclamationFunction", + "output": "persistent://public/default/exclamation-output", + "autoAck": true, + "inputs": [ + "persistent://public/default/exclamation-input" + ], + "parallelism": 1 +} +``` + +As we can see, the parallelism of the function is 1, meaning that only one instance of the function is running in our cluster. Let's update our function to a parallelism of 3 using the `update` command: + +```bash +$ bin/pulsar-admin functions update \ + --jar examples/api-examples.jar \ + --className org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --inputs persistent://public/default/exclamation-input \ + --output persistent://public/default/exclamation-output \ + --tenant public \ + --namespace default \ + --name exclamation \ + --parallelism 3 +``` + +You should see `Updated successfully` in the output. 
If you run the `get` command from above for the function, you can see that the parallelism has increased to 3, meaning that there are now three instances of the function running in our cluster:

```json
{
  "tenant": "public",
  "namespace": "default",
  "name": "exclamation",
  "className": "org.apache.pulsar.functions.api.examples.ExclamationFunction",
  "output": "persistent://public/default/exclamation-output",
  "autoAck": true,
  "inputs": [
    "persistent://public/default/exclamation-input"
  ],
  "parallelism": 3
}
```

Finally, we can shut down our running function using the `delete` command:

```bash
$ bin/pulsar-admin functions delete \
  --tenant public \
  --namespace default \
  --name exclamation
```

If you see `Deleted successfully` in the output, then you've successfully run, updated, and shut down a Pulsar Function running in cluster mode. Congrats! Now, let's go even further and run a brand new function in the next section.

## Writing and running a new function

> In order to write and run the [Python](functions-api.md#python) function below, you'll need to install a few dependencies:
> ```bash
> $ pip install pulsar-client protobuf futures grpcio grpcio-tools
> ```

In the above examples, we ran and managed a pre-written Pulsar Function and saw how it worked. To really get our hands dirty, let's write our own function from scratch, using the Python API. This simple function will also take a string as input, but this time it will reverse the string and publish the resulting, reversed string to the specified topic.

First, create a new Python file:

```bash
$ touch reverse.py
```

In that file, add the following:

```python
def process(input):
    return input[::-1]
```

Here, the `process` method defines the processing logic of the Pulsar Function. It simply uses some Python slice magic to reverse each incoming string. Now, we can deploy the function using `create`:

```bash
$ bin/pulsar-admin functions create \
  --py reverse.py \
  --className reverse \
  --inputs persistent://public/default/backwards \
  --output persistent://public/default/forwards \
  --tenant public \
  --namespace default \
  --name reverse
```

If you see `Created successfully`, the function is ready to accept incoming messages. Because the function is running in cluster mode, we can **trigger** the function using the [`trigger`](reference-pulsar-admin.md#pulsar-admin-functions-trigger) command. This command will send a message that we specify to the function and also give us the function's output. Here's an example:

```bash
$ bin/pulsar-admin functions trigger \
  --name reverse \
  --tenant public \
  --namespace default \
  --triggerValue "sdrawrof won si tub sdrawkcab saw gnirts sihT"
```

You should get this output:

```
This string was backwards but is now forwards
```

Once again, success! We created a brand new Pulsar Function, deployed it in our Pulsar standalone cluster in [cluster mode](#cluster-mode), and successfully triggered the function.
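Because triggering is just a convenience over publishing to one of the function's input topics (as noted earlier), you can also exercise the same function with an ordinary Pulsar client. Here's a minimal Java sketch, assuming the standalone cluster above and the `backwards`/`forwards` topics that the function was created with (the subscription name is illustrative):

```java
import org.apache.pulsar.client.api.Consumer;
import org.apache.pulsar.client.api.Message;
import org.apache.pulsar.client.api.Producer;
import org.apache.pulsar.client.api.PulsarClient;

PulsarClient client = PulsarClient.builder()
        .serviceUrl("pulsar://localhost:6650")
        .build();

// Subscribe to the function's output topic first so no results are missed
Consumer consumer = client.newConsumer()
        .topic("persistent://public/default/forwards")
        .subscriptionName("reverse-test")
        .subscribe();

// Publish to the function's input topic
Producer producer = client.newProducer()
        .topic("persistent://public/default/backwards")
        .create();
producer.send("olleh".getBytes());

// The function reverses the string and publishes it to the output topic
Message msg = consumer.receive();
System.out.println(new String(msg.getData())); // Prints "hello"
consumer.acknowledge(msg);

client.close();
```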
If you're ready for more, check out one of these docs:

* [The Pulsar Functions API](functions-api.md)
* [Deploying Pulsar Functions](functions-deploying.md) diff --git a/site2/docs/getting-started-clients.md b/site2/docs/getting-started-clients.md new file mode 100644 index 0000000000000000000000000000000000000000..efb67c588ac69118ffcbf5720dcefcab84af2270 --- /dev/null +++ b/site2/docs/getting-started-clients.md @@ -0,0 +1,41 @@ +--- +id: client-libraries +title: Pulsar client libraries +sidebar_label: Client libraries +--- + +Pulsar currently has client libraries available for four languages:

* [Java](#java-client)
* [Go](#go-client)
* [Python](#python-client)
* [C++](#c-client)

## Java client

For a tutorial on using the Pulsar Java client to produce and consume messages, see [The Pulsar Java client](client-libraries-java.md).

There are also two independent sets of Javadoc API docs available:

Library | Purpose
:-------|:-------
[`org.apache.pulsar.client.api`](/api/client) | The [Pulsar Java client](client-libraries-java.md) for producing and consuming messages on Pulsar topics
[`org.apache.pulsar.client.admin`](/api/admin) | The Java client for the [Pulsar admin interface](admin-api-overview.md)


## Go client

For a tutorial on using the Pulsar Go client, see [The Pulsar Go client](client-libraries-go.md).


## Python client

For a tutorial on using the Pulsar Python client, see [The Pulsar Python client](client-libraries-python.md).

There are also [pdoc](https://github.com/BurntSushi/pdoc)-generated API docs for the Python client [here](/api/python).

## C++ client

For a tutorial on using the Pulsar C++ client, see [The Pulsar C++ client](client-libraries-cpp.md).

There are also [Doxygen](http://www.stack.nl/~dimitri/doxygen/)-generated API docs for the C++ client [here](/api/cpp). diff --git a/site2/docs/getting-started-concepts-and-architecture.md b/site2/docs/getting-started-concepts-and-architecture.md new file mode 100644 index 0000000000000000000000000000000000000000..3ee3c18db55fb5ad9e0162243d3cd73ac0278751 --- /dev/null +++ b/site2/docs/getting-started-concepts-and-architecture.md @@ -0,0 +1,757 @@ +--- +id: concepts-architecture +title: Pulsar concepts and architecture +sidebar_label: Concepts and architecture +--- + +Pulsar is a multi-tenant, high-performance solution for server-to-server messaging originally developed by [Yahoo](http://yahoo.github.io/) and now under the stewardship of the [Apache Software Foundation](https://www.apache.org/).

Pulsar's key features include:

* Native support for multiple clusters in a Pulsar instance, with seamless [geo-replication](administration-geo.md) of messages across clusters
* Very low publish and end-to-end latency
* Seamless scalability out to over a million topics
* A simple [client API](#client-api) with bindings for [Java](client-libraries-java.md), [Python](client-libraries-python.md), and [C++](client-libraries-cpp.md)
* Multiple [subscription modes](#subscription-modes) for topics ([exclusive](#exclusive), [shared](#shared), and [failover](#failover))
* Guaranteed message delivery with [persistent message storage](#persistent-storage) provided by [Apache BookKeeper](http://bookkeeper.apache.org/)

## Messages

Messages are the basic "unit" of Pulsar. They're what producers publish to topics and what consumers then consume from topics (and acknowledge when the message has been processed).
Messages are the analogue of letters in a postal service system.

Component | Purpose
:---------|:-------
Value / data payload | The data carried by the message. All Pulsar messages carry raw bytes, although message data can also conform to data [schemas](#schema-registry)
Key | Messages can optionally be tagged with keys, which can be useful for things like [topic compaction](#compaction)
Properties | An optional key/value map of user-defined properties
Producer name | The name of the producer that produced the message (producers are automatically given default names, but you can apply your own explicitly as well)
Sequence ID | Each Pulsar message belongs to an ordered sequence on its topic. A message's sequence ID is its ordering in that sequence.
Publish time | The timestamp of when the message was published (automatically applied by the producer)
Event time | An optional timestamp that applications can attach to the message representing when something happened, e.g. when the message was processed. The event time of a message is 0 if none is explicitly set.


> For a more in-depth breakdown of Pulsar message contents, see the documentation on Pulsar's [binary protocol](developing-binary-protocol.md).

## Producers, consumers, topics, and subscriptions

Pulsar is built on the [publish-subscribe](https://en.wikipedia.org/wiki/Publish%E2%80%93subscribe_pattern) pattern, aka pub-sub. In this pattern, [producers](#producers) publish messages to [topics](#topics). [Consumers](#consumers) can then [subscribe](#subscription-modes) to those topics, process incoming messages, and send an acknowledgement when processing is complete.

Once a subscription has been created, all messages will be [retained](#persistent-storage) by Pulsar, even if the consumer gets disconnected. Retained messages will be discarded only when a consumer acknowledges that they've been successfully processed.

### Producers

A producer is a process that attaches to a topic and publishes messages to a Pulsar broker for processing.

#### Send modes

Producers can send messages to brokers either synchronously (sync) or asynchronously (async).

| Mode | Description |
|:-----------|:------------|
| Sync send | The producer will wait for acknowledgement from the broker after sending each message. If acknowledgment isn't received then the producer will consider the send operation a failure. |
| Async send | The producer will put the message in a blocking queue and return immediately. The client library will then send the message to the broker in the background. If the queue is full (max size [configurable](reference-configuration.md#broker)), the producer could be blocked or fail immediately when calling the API, depending on arguments passed to the producer. |

#### Compression

Messages published by producers can be compressed during transport in order to save bandwidth. Pulsar currently supports two types of compression:

* [LZ4](https://github.com/lz4/lz4)
* [ZLIB](https://zlib.net/)

#### Batching

If batching is enabled, the producer will accumulate and send a batch of messages in a single request. The batch size is defined by a maximum number of messages and a maximum publish latency, as in the sketch below.
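With the Java client, for example, both batching limits are set when the producer is built. A minimal sketch, assuming an existing `PulsarClient` named `client` (the topic name is illustrative):

```java
import java.util.concurrent.TimeUnit;

Producer producer = client.newProducer()
        .topic("persistent://public/default/my-topic")
        .enableBatching(true)
        // Send a batch once it holds 100 messages...
        .batchingMaxMessages(100)
        // ...or once the oldest queued message is 10 ms old, whichever comes first
        .batchingMaxPublishDelay(10, TimeUnit.MILLISECONDS)
        .create();
```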
### Consumers

A consumer is a process that attaches to a topic via a subscription and then receives messages.

#### Receive modes

Messages can be received from brokers either synchronously (sync) or asynchronously (async).

| Mode | Description |
|:--------------|:------------|
| Sync receive | A sync receive will be blocked until a message is available. |
| Async receive | An async receive will return immediately with a future value---a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) in Java, for example---that completes once a new message is available. |

#### Acknowledgement

When a consumer has successfully processed a message, it needs to send an acknowledgement to the broker so that the broker can discard the message (otherwise it [stores](#persistent-storage) the message).

Messages can be acknowledged either one by one or cumulatively. With cumulative acknowledgement, the consumer only needs to acknowledge the last message it received. All messages in the stream up to (and including) the provided message will not be re-delivered to that consumer.


> Cumulative acknowledgement cannot be used with [shared subscription mode](#subscription-modes), because shared mode involves multiple consumers having access to the same subscription.

#### Listeners

Client libraries can provide their own listener implementations for consumers. The [Java client](client-libraries-java.md), for example, provides a `MessageListener` interface (`org.apache.pulsar.client.api.MessageListener`). In this interface, the `received` method is called whenever a new message is received.

### Topics

As in other pub-sub systems, topics in Pulsar are named channels for transmitting messages from producers to consumers. Topic names are URLs that have a well-defined structure:

```http
{persistent|non-persistent}://tenant/namespace/topic
```

Topic name component | Description
:--------------------|:-----------
`persistent` / `non-persistent` | This identifies the type of topic. Pulsar supports two kinds of topics: [persistent](#persistent-storage) and [non-persistent](#non-persistent-topics) (persistent is the default, so if you don't specify a type the topic will be persistent). With persistent topics, all messages are durably [persisted](#persistent-storage) on disk (that means on multiple disks unless the broker is standalone), whereas data for [non-persistent](#non-persistent-topics) topics isn't persisted to storage disks.
`tenant` | The topic's tenant within the instance. Tenants are essential to multi-tenancy in Pulsar and can be spread across clusters.
`namespace` | The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the [namespace](#namespaces) level. Each tenant can have multiple namespaces.
`topic` | The final part of the name. Topic names are freeform and have no special meaning in a Pulsar instance.


> #### No need to explicitly create new topics
> You don't need to explicitly create topics in Pulsar.
> If a client attempts to write or receive messages to/from a topic that does not yet exist, Pulsar will automatically create that topic under the [namespace](#namespaces) provided in the [topic name](#topics).


### Namespaces

A namespace is a logical nomenclature within a tenant. A tenant can create multiple namespaces via the [admin API](admin-api-namespaces.md#create). For instance, a tenant with different applications can create a separate namespace for each application. A namespace allows the application to create and manage a hierarchy of topics. The namespace `my-tenant/app1`, for example, is the namespace for the application `app1` owned by the tenant `my-tenant`. You can create any number of [topics](#topics) under the namespace.

### Subscription modes

A subscription is a named configuration rule that determines how messages are delivered to consumers. There are three available subscription modes in Pulsar: [exclusive](#exclusive), [shared](#shared), and [failover](#failover). These modes are illustrated in the figure below.

![Subscription modes](/docs/assets/pulsar-subscription-modes.png)

#### Exclusive

In *exclusive* mode, only a single consumer is allowed to attach to the subscription. If more than one consumer attempts to subscribe to a topic using the same subscription, the consumer receives an error.

In the diagram above, only **Consumer-A** is allowed to consume messages.

> Exclusive mode is the default subscription mode.

![Exclusive subscriptions](/docs/assets/pulsar-exclusive-subscriptions.png)

#### Shared

In *shared* or *round robin* mode, multiple consumers can attach to the same subscription. Messages are delivered in a round robin distribution across consumers, and any given message is delivered to only one consumer. When a consumer disconnects, all the messages that were sent to it and not acknowledged will be rescheduled for sending to the remaining consumers.

In the diagram above, **Consumer-B-1** and **Consumer-B-2** are able to subscribe to the topic, but **Consumer-C-1** and others could as well.

> #### Limitations of shared mode
> There are two important things to be aware of when using shared mode:
> * Message ordering is not guaranteed.
> * You cannot use cumulative acknowledgment with shared mode.

![Shared subscriptions](/docs/assets/pulsar-shared-subscriptions.png)

#### Failover

In *failover* mode, multiple consumers can attach to the same subscription. The consumers will be lexically sorted by the consumer's name and the first consumer will initially be the only one receiving messages. This consumer is called the *master consumer*.

When the master consumer disconnects, all (non-acked and subsequent) messages will be delivered to the next consumer in line.

In the diagram above, Consumer-C-1 is the master consumer while Consumer-C-2 would be the next in line to receive messages if Consumer-C-1 disconnected.

![Failover subscriptions](/docs/assets/pulsar-failover-subscriptions.png)

### Multi-topic subscriptions

When a consumer subscribes to a Pulsar topic, by default it subscribes to one specific topic, such as `persistent://public/default/my-topic`. As of Pulsar version 1.23.0-incubating, however, Pulsar consumers can simultaneously subscribe to multiple topics.
You can define a list of topics in two ways:

* On the basis of a [**reg**ular **ex**pression](https://en.wikipedia.org/wiki/Regular_expression) (regex), for example `persistent://public/default/finance-.*`
* By explicitly defining a list of topics

> When subscribing to multiple topics by regex, all topics must be in the same [namespace](#namespaces)

When subscribing to multiple topics, the Pulsar client will automatically make a call to the Pulsar API to discover the topics that match the regex pattern/list and then subscribe to all of them. If any of the topics don't currently exist, the consumer will auto-subscribe to them once the topics are created.

> #### No ordering guarantees
> When a consumer subscribes to multiple topics, all ordering guarantees normally provided by Pulsar on single topics do not hold. If your use case for Pulsar involves any strict ordering requirements, we would strongly recommend against using this feature.

Here are some multi-topic subscription examples for Java:

```java
import java.util.regex.Pattern;

import org.apache.pulsar.client.api.Consumer;
import org.apache.pulsar.client.api.PulsarClient;

PulsarClient pulsarClient = // Instantiate Pulsar client object

// Subscribe to all topics in a namespace
Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default/.*");
Consumer allTopicsConsumer = pulsarClient.subscribe(allTopicsInNamespace, "subscription-1");

// Subscribe to a subset of topics in a namespace, based on regex
Pattern someTopicsInNamespace = Pattern.compile("persistent://public/default/foo.*");
Consumer someTopicsConsumer = pulsarClient.subscribe(someTopicsInNamespace, "subscription-1");
```

For code examples, see:

* [Java](client-libraries-java.md#multi-topic-subscriptions)

### Partitioned topics

Normal topics can be served only by a single broker, which limits the topic's maximum throughput. *Partitioned topics* are a special type of topic that can be handled by multiple brokers, which allows for much higher throughput.

Behind the scenes, a partitioned topic is actually implemented as N internal topics, where N is the number of partitions. When publishing messages to a partitioned topic, each message is routed to one of several brokers. The distribution of partitions across brokers is handled automatically by Pulsar.

The diagram below illustrates this:

![](/docs/assets/partitioning.png)

Here, the topic **Topic1** has five partitions (**P0** through **P4**) split across three brokers. Because there are more partitions than brokers, two brokers handle two partitions apiece, while the third handles only one (again, Pulsar handles this distribution of partitions automatically).

Messages for this topic are broadcast to two consumers. The [routing mode](#routing-modes) determines which partition each message is published to, while the [subscription mode](#subscription-modes) determines which messages go to which consumers.

Decisions about routing and subscription modes can be made separately in most cases. In general, throughput concerns should guide partitioning/routing decisions while subscription decisions should be guided by application semantics.

There is no difference between partitioned topics and normal topics in terms of how subscription modes work, as partitioning only determines what happens between when a message is published by a producer and processed and acknowledged by a consumer. Partitioned topics need to be explicitly created via the [admin API](admin-api-overview.md); the number of partitions can be specified when creating the topic, as in the sketch below.
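Assuming the Java admin client (accessor names per the `PulsarAdmin` interface of this era; the topic name is illustrative), that looks like:

```java
import org.apache.pulsar.client.admin.PulsarAdmin;

PulsarAdmin admin = // Instantiate admin client object

// Create a topic with 4 partitions; Pulsar creates the internal
// per-partition topics and spreads them across the cluster's brokers
admin.persistentTopics().createPartitionedTopic(
        "persistent://public/default/partitioned-topic-example", 4);
```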
#### Routing modes

When publishing to partitioned topics, you must specify a *routing mode*. The routing mode determines which partition---that is, which internal topic---each message should be published to.

There are three routing modes available by default:

Mode | Description | Ordering guarantee
:----|:------------|:------------------
Key hash | If a key property has been specified on the message, the partitioned producer will hash the key and assign it to a particular partition. | Per-key-bucket ordering
Single default partition | If no key is provided, each producer's message will be routed to a dedicated partition, initially randomly selected | Per-producer ordering
Round robin distribution | If no key is provided, all messages will be routed to different partitions in round-robin fashion to achieve maximum throughput. | None

In addition to these default modes, you can also create a custom routing mode if you're using the [Java client](client-libraries-java.md) by implementing the `MessageRouter` interface (`org.apache.pulsar.client.api.MessageRouter`).



### Non-persistent topics


By default, Pulsar persistently stores *all* unacknowledged messages on multiple [BookKeeper](#persistent-storage) bookies (storage nodes). Data for messages on persistent topics can thus survive broker restarts and subscriber failover.

Pulsar also, however, supports **non-persistent topics**, which are topics on which messages are *never* persisted to disk and live only in memory. When using non-persistent delivery, killing a Pulsar broker or disconnecting a subscriber to a topic means that all in-transit messages are lost on that (non-persistent) topic, meaning that clients may see message loss.

Non-persistent topics have names of this form (note the `non-persistent` in the name):

```http
non-persistent://tenant/namespace/topic
```

> For more info on using non-persistent topics, see the [Non-persistent messaging cookbook](cookbooks-non-persistent.md).

In non-persistent topics, brokers immediately deliver messages to all connected subscribers *without persisting them* in [BookKeeper](#persistent-storage). If a subscriber is disconnected, the broker will not be able to deliver those in-transit messages, and subscribers will never be able to receive those messages again. Eliminating the persistent storage step makes messaging on non-persistent topics slightly faster than on persistent topics in some cases, but with the caveat that some of the core benefits of Pulsar are lost.

> With non-persistent topics, message data lives only in memory. If a message broker fails or message data can otherwise not be retrieved from memory, your message data may be lost. Use non-persistent topics only if you're *certain* that your use case requires it and can sustain it.

By default, non-persistent topics are enabled on Pulsar brokers. You can disable them in the broker's [configuration](reference-configuration.md#broker-enableNonPersistentTopics). You can manage non-persistent topics using the [`pulsar-admin non-persistent`](reference-cli-tools.md#pulsar-admin-non-persistent) interface.
#### Performance

Non-persistent messaging is usually faster than persistent messaging because brokers don't persist messages and immediately send acks back to the producer as soon as the message is delivered to all connected subscribers. Producers thus see comparatively low publish latency with non-persistent topics.

#### Client API

Producers and consumers can connect to non-persistent topics in the same way as persistent topics, with the crucial difference that the topic name must start with `non-persistent`. All three subscription modes---[exclusive](#exclusive), [shared](#shared), and [failover](#failover)---are supported for non-persistent topics.

Here's an example [Java consumer](client-libraries-java.md#consumer) for a non-persistent topic:

```java
PulsarClient client = PulsarClient.create("pulsar://localhost:6650");
String npTopic = "non-persistent://public/default/my-topic";
String subscriptionName = "my-subscription-name";

Consumer consumer = client.subscribe(npTopic, subscriptionName);
```

Here's an example [Java producer](client-libraries-java.md#producer) for the same non-persistent topic:

```java
Producer producer = client.createProducer(npTopic);
```

## Architecture overview

At the highest level, a Pulsar instance is composed of one or more Pulsar clusters. Clusters within an instance can [replicate](#replication) data amongst themselves.

In a Pulsar cluster:

* One or more brokers handle and load balance incoming messages from producers, dispatch messages to consumers, communicate with the Pulsar configuration store to handle various coordination tasks, store messages in BookKeeper instances (aka bookies), rely on a cluster-specific ZooKeeper cluster for certain tasks, and more.
* A BookKeeper cluster consisting of one or more bookies handles [persistent storage](#persistent-storage) of messages.
* A ZooKeeper cluster specific to that cluster handles cluster-level configuration and coordination tasks.

The diagram below provides an illustration of a Pulsar cluster:

![Pulsar architecture diagram](/docs/assets/pulsar-system-architecture.png)

At the broader instance level, an instance-wide ZooKeeper cluster called the configuration store handles coordination tasks involving multiple clusters, for example [geo-replication](#replication).

## Brokers

The Pulsar message broker is a stateless component that's primarily responsible for running two other components:

* An HTTP server that exposes a REST API for both [administrative tasks](reference-rest-api.md) and [topic lookup](#client-setup-phase) for producers and consumers
* A dispatcher, which is an asynchronous TCP server over a custom [binary protocol](developing-binary-protocol.md) used for all data transfers

Messages are typically dispatched out of a [managed ledger](#managed-ledgers) cache for the sake of performance, *unless* the backlog exceeds the cache size. If the backlog grows too large for the cache, the broker will start reading entries from BookKeeper.

Finally, to support geo-replication on global topics, the broker manages replicators that tail the entries published in the local region and republish them to the remote region using the Pulsar [Java client library](client-libraries-java.md).

> For a guide to managing Pulsar brokers, see the [brokers](admin-api-brokers.md) guide.

## Clusters

A Pulsar instance consists of one or more Pulsar *clusters*.
Clusters, in turn, consist of:

* One or more Pulsar [brokers](#brokers)
* A ZooKeeper quorum used for cluster-level configuration and coordination
* An ensemble of bookies used for [persistent storage](#persistent-storage) of messages

Clusters can replicate amongst themselves using [geo-replication](#replication).

> For a guide to managing Pulsar clusters, see the [Clusters and brokers](admin-api-brokers.md#managing-clusters) guide.

## Metadata store

Pulsar uses [Apache ZooKeeper](https://zookeeper.apache.org/) for metadata storage, cluster configuration, and coordination. In a Pulsar instance:

* A configuration store quorum stores configuration for tenants, namespaces, and other entities that need to be globally consistent.
* Each cluster has its own local ZooKeeper ensemble that stores cluster-specific configuration and coordination such as ownership metadata, broker load reports, BookKeeper ledger metadata, and more.

## Persistent storage

Pulsar provides guaranteed message delivery for applications. If a message successfully reaches a Pulsar broker, it will be delivered to its intended target.

This guarantee requires that non-acknowledged messages are stored in a durable manner until they can be delivered to and acknowledged by consumers. This mode of messaging is commonly called *persistent messaging*. In Pulsar, N copies of all messages are stored and synced on disk, for example 4 copies across two servers with mirrored [RAID](https://en.wikipedia.org/wiki/RAID) volumes on each server.

### Apache BookKeeper

Pulsar uses a system called [Apache BookKeeper](http://bookkeeper.apache.org/) for persistent message storage. BookKeeper is a distributed [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging) (WAL) system that provides a number of crucial advantages for Pulsar:

* It enables Pulsar to utilize many independent logs, called [ledgers](#ledgers). Multiple ledgers can be created for topics over time.
* It offers very efficient storage for sequential data that handles entry replication.
* It guarantees read consistency of ledgers in the presence of various system failures.
* It offers even distribution of I/O across bookies.
* It's horizontally scalable in both capacity and throughput. Capacity can be immediately increased by adding more bookies to a cluster.
* Bookies are designed to handle thousands of ledgers with concurrent reads and writes. By using multiple disk devices---one for journal and another for general storage---bookies are able to isolate the effects of read operations from the latency of ongoing write operations.

In addition to message data, *cursors* are also persistently stored in BookKeeper. Cursors are subscription positions for consumers. BookKeeper enables Pulsar to store consumer position in a scalable fashion.

Persistent message storage is Pulsar's default. This accounts for the `persistent` in most topic names. Here's an example:

```http
persistent://my-tenant/my-namespace/my-topic
```

> Pulsar also supports ephemeral ([non-persistent](#non-persistent-topics)) message storage.


You can see an illustration of how brokers and bookies interact in the diagram below:

![Brokers and bookies](/docs/assets/broker-bookie.png)


### Ledgers

A ledger is an append-only data structure with a single writer that is assigned to multiple BookKeeper storage nodes, or bookies. Ledger entries are replicated to multiple bookies.
Ledgers themselves have very simple semantics:

* A Pulsar broker can create a ledger, append entries to the ledger, and close the ledger.
* After the ledger has been closed---either explicitly or because the writer process crashed---it can then be opened only in read-only mode.
* Finally, when entries in the ledger are no longer needed, the whole ledger can be deleted from the system (across all bookies).

#### Ledger read consistency

The main strength of BookKeeper is that it guarantees read consistency in ledgers in the presence of failures. Since the ledger can only be written to by a single process, that process is free to append entries very efficiently, without needing to obtain consensus. After a failure, the ledger will go through a recovery process that will finalize the state of the ledger and establish which entry was last committed to the log. After that point, all readers of the ledger are guaranteed to see the exact same content.

#### Managed ledgers

Given that BookKeeper ledgers provide a single log abstraction, a library was developed on top of the ledger called the *managed ledger* that represents the storage layer for a single topic. A managed ledger represents the abstraction of a stream of messages with a single writer that keeps appending at the end of the stream and multiple cursors that are consuming the stream, each with its own associated position.

Internally, a single managed ledger uses multiple BookKeeper ledgers to store the data. There are two reasons to have multiple ledgers:

1. After a failure, a ledger is no longer writable and a new one needs to be created.
2. A ledger can be deleted when all cursors have consumed the messages it contains. This allows for periodic rollover of ledgers.

### Journal storage

In BookKeeper, *journal* files contain BookKeeper transaction logs. Before making an update to a [ledger](#ledgers), a bookie needs to ensure that a transaction describing the update is written to persistent (non-volatile) storage. A new journal file is created once the bookie starts or the older journal file reaches the journal file size threshold (configured using the [`journalMaxSizeMB`](reference-configuration.md#bookkeeper-journalMaxSizeMB) parameter).

### Non-persistent storage

A future version of BookKeeper will support *non-persistent messaging*, and thus multiple durability modes at the topic level. You'll then be able to set the durability mode per topic, replacing the `persistent` in topic names with a `non-persistent` indicator.

## Message retention and expiry

By default, Pulsar message brokers:

* immediately delete *all* messages that have been acknowledged by a consumer, and
* [persistently store](#persistent-storage) all unacknowledged messages in a message backlog.

Pulsar has two features, however, that enable you to override this default behavior:

* Message **retention** enables you to store messages that have been acknowledged by a consumer
* Message **expiry** enables you to set a time to live (TTL) for messages that have not yet been acknowledged

> All message retention and expiry is managed at the [namespace](#namespaces) level. For a how-to, see the [Message retention and expiry](cookbooks-retention-expiry.md) cookbook.
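As a sketch of what those namespace-level policies look like with the Java admin client (the namespace and limits here are illustrative):

```java
import org.apache.pulsar.client.admin.PulsarAdmin;
import org.apache.pulsar.common.policies.data.RetentionPolicies;

PulsarAdmin admin = // Instantiate admin client object

// Retention: keep acknowledged messages for up to 24 hours or 500 MB,
// whichever limit is reached first
admin.namespaces().setRetention("public/default",
        new RetentionPolicies(24 * 60, 500));

// Expiry: give unacknowledged messages a TTL of 5 minutes
admin.namespaces().setNamespaceMessageTTL("public/default", 5 * 60);
```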
The diagram below illustrates both concepts:

![Message retention and expiry](/docs/assets/retention-expiry.png)

With message retention, shown at the top, a retention policy applied to all topics in a namespace dictates that some messages are durably stored in Pulsar even though they've already been acknowledged. Acknowledged messages that are not covered by the retention policy are deleted. Without a retention policy, *all* of the acknowledged messages would be deleted.

With message expiry, shown at the bottom, some messages are deleted, even though they haven't been acknowledged, because they've expired according to the TTL applied to the namespace (for example because a TTL of 5 minutes has been applied and the messages haven't been acknowledged but are 10 minutes old).

## Pulsar Functions

For an in-depth look at Pulsar Functions, see the [Pulsar Functions overview](functions-overview.md).

## Replication

Pulsar enables messages to be produced and consumed in different geo-locations. For instance, your application may be publishing data in one region or market and you would like to process it for consumption in other regions or markets. [Geo-replication](administration-geo.md) in Pulsar enables you to do that.

## Message deduplication

Message **duplication** occurs when a message is [persisted](#persistent-storage) by Pulsar more than once. Message ***de*duplication** is an optional Pulsar feature that prevents unnecessary message duplication by processing each message only once, *even if the message is received more than once*.

The following diagram illustrates what happens when message deduplication is disabled vs. enabled:

![Pulsar message deduplication](/docs/assets/message-deduplication.png)


Message deduplication is disabled in the scenario shown at the top. Here, a producer publishes message 1 on a topic; the message reaches a Pulsar broker and is [persisted](#persistent-storage) to BookKeeper. The producer then sends message 1 again (in this case due to some retry logic), and the message is received by the broker and stored in BookKeeper again, which means that duplication has occurred.

In the second scenario at the bottom, the producer publishes message 1, which is received by the broker and persisted, as in the first scenario. When the producer attempts to publish the message again, however, the broker knows that it has already seen message 1 and thus does not persist the message.

> Message deduplication is handled at the namespace level. For more instructions, see the [message deduplication cookbook](cookbooks-deduplication.md).


### Producer idempotency

The other available approach to message deduplication is to ensure that each message is *only produced once*. This approach is typically called **producer idempotency**. The drawback of this approach is that it defers the work of message deduplication to the application. In Pulsar, deduplication is handled at the broker level, which means that you don't need to modify your Pulsar client code. Instead, you only need to make administrative changes (see the [Managing message deduplication](cookbooks-deduplication.md) cookbook for a guide).

### Deduplication and effectively-once semantics

Message deduplication makes Pulsar an ideal messaging system to be used in conjunction with stream processing engines (SPEs) and other systems seeking to provide [effectively-once](https://streaml.io/blog/exactly-once) processing semantics.
Messaging systems that don't offer automatic message deduplication require the SPE or other system to guarantee deduplication, which means that strict message ordering comes at the cost of burdening the application with the responsibility of deduplication. With Pulsar, strict ordering guarantees come at no application-level cost.

> More in-depth information can be found in [this post](https://streaml.io/blog/pulsar-effectively-once/) on the [Streamlio blog](https://streaml.io/blog)


## Multi-tenancy

Pulsar was created from the ground up as a multi-tenant system. To support multi-tenancy, Pulsar has a concept of tenants. Tenants can be spread across clusters and can each have their own [authentication and authorization](administration-auth.md) scheme applied to them. They are also the administrative unit at which storage quotas, [message TTL](cookbooks-retention-expiry.md#time-to-live-ttl), and isolation policies can be managed.

The multi-tenant nature of Pulsar is reflected most visibly in topic URLs, which have this structure:

```http
persistent://tenant/namespace/topic
```

As you can see, the tenant is the most basic unit of categorization for topics (more fundamental than the namespace and topic name).

### Tenants

Tenants are the highest administrative unit within a Pulsar instance. To each tenant in a Pulsar instance you can assign:

* An [authorization](administration-auth.md#authorization) scheme
* The set of clusters to which the tenant's configuration applies

### Namespaces

Tenants and namespaces are two key concepts of Pulsar to support multi-tenancy.

* Pulsar is provisioned for specified tenants with appropriate capacity allocated to the tenant.
* A namespace is the administrative unit nomenclature within a tenant. The configuration policies set on a namespace apply to all the topics created in that namespace. A tenant may create multiple namespaces via self-administration using the REST API and the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool. For instance, a tenant with different applications can create a separate namespace for each application.

Names for topics in the same namespace will look like this:

```http
persistent://tenant/app1/topic-1

persistent://tenant/app1/topic-2

persistent://tenant/app1/topic-3
```


## Authentication and Authorization

Pulsar supports a pluggable [authentication](administration-auth.md) mechanism that can be configured at the broker. Pulsar also supports authorization, which identifies clients and their access rights on topics and tenants.

## Client interface

Pulsar exposes a client API with language bindings for [Java](client-libraries-java.md) and [C++](client-libraries-cpp.md). The client API optimizes and encapsulates Pulsar's client-broker communication protocol and exposes a simple and intuitive API for use by applications.

Under the hood, the current official Pulsar client libraries support transparent reconnection and/or connection failover to brokers, queuing of messages until acknowledged by the broker, and heuristics such as connection retries with backoff.
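As a minimal sketch of the basic shape of that API in Java (the topic and subscription names are illustrative):

```java
import org.apache.pulsar.client.api.Consumer;
import org.apache.pulsar.client.api.Message;
import org.apache.pulsar.client.api.Producer;
import org.apache.pulsar.client.api.PulsarClient;

// The client handles reconnection, retries, and message queuing transparently
PulsarClient client = PulsarClient.builder()
        .serviceUrl("pulsar://localhost:6650")
        .build();

Producer producer = client.newProducer()
        .topic("persistent://public/default/example-topic")
        .create();
producer.send("Hello Pulsar".getBytes());

Consumer consumer = client.newConsumer()
        .topic("persistent://public/default/example-topic")
        .subscriptionName("example-subscription")
        .subscribe();
Message msg = consumer.receive();
consumer.acknowledge(msg);

client.close();
```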
> #### Custom client libraries
> If you'd like to create your own client library, we recommend consulting the documentation on Pulsar's custom [binary protocol](developing-binary-protocol.md)


### Client setup phase

When an application wants to create a producer/consumer, the Pulsar client library will initiate a setup phase that is composed of two steps:

1. The client will attempt to determine the owner of the topic by sending an HTTP lookup request to the broker. The request could reach one of the active brokers which, by looking at the (cached) ZooKeeper metadata, will know who is serving the topic or, in case nobody is serving it, will try to assign it to the least loaded broker.
1. Once the client library has the broker address, it will create a TCP connection (or reuse an existing connection from the pool) and authenticate it. Within this connection, client and broker exchange binary commands of a custom protocol. At this point the client will send a command to create a producer/consumer to the broker, which will comply after having validated the authorization policy.

Whenever the TCP connection breaks, the client will immediately re-initiate this setup phase and will keep trying with exponential backoff to re-establish the producer or consumer until the operation succeeds.

## Pulsar proxy

One way for Pulsar clients to interact with a Pulsar [cluster](#clusters) is by connecting to Pulsar message [brokers](#brokers) directly. In some cases, however, this kind of direct connection is either infeasible or undesirable because the client doesn't have direct access to broker addresses. If you're running Pulsar in a cloud environment or on [Kubernetes](https://kubernetes.io) or an analogous platform, for example, then direct client connections to brokers are likely not possible.

The **Pulsar proxy** provides a solution to this problem by acting as a single gateway for all of the brokers in a cluster. If you run the Pulsar proxy (which, again, is optional), all client connections with the Pulsar cluster will flow through the proxy rather than communicating with brokers.

> For the sake of performance and fault tolerance, you can run as many instances of the Pulsar proxy as you'd like.

Architecturally, the Pulsar proxy gets all the information it requires from ZooKeeper. When starting the proxy on a machine, you only need to provide ZooKeeper connection strings for the cluster-specific and instance-wide configuration store clusters. Here's an example:

```bash
$ bin/pulsar proxy \
  --zookeeper-servers zk-0,zk-1,zk-2 \
  --configuration-store-servers zk-0,zk-1,zk-2
```

> #### Pulsar proxy docs
> For documentation on using the Pulsar proxy, see the [Pulsar proxy admin documentation](administration-proxy.md).


Some important things to know about the Pulsar proxy:

* Connecting clients don't need to provide *any* specific configuration to use the Pulsar proxy. You won't need to update the client configuration for existing applications beyond updating the IP used for the service URL (for example if you're running a load balancer over the Pulsar proxy).
* [TLS encryption and authentication](administration-auth.md#tls-client-auth) are supported by the Pulsar proxy

## Service discovery

[Clients](getting-started-clients.md) connecting to Pulsar brokers need to be able to communicate with an entire Pulsar instance using a single URL.
Pulsar provides a built-in service discovery mechanism that you can set up using the instructions in the [Deploying a Pulsar instance](deploy-bare-metal.md#service-discovery-setup) guide.

You can use your own service discovery system if you'd like. If you use your own system, there is just one requirement: when a client performs an HTTP request to an endpoint, such as `http://pulsar.us-west.example.com:8080`, the client needs to be redirected to *some* active broker in the desired cluster, whether via DNS, an HTTP or IP redirect, or some other means.

The diagram below illustrates Pulsar service discovery:

![alt-text](/docs/assets/pulsar-service-discovery.png)

In this diagram, the Pulsar cluster is addressable via a single DNS name: `pulsar-cluster.acme.com`. A [Python client](client-libraries-python.md), for example, could access this Pulsar cluster like this:

```python
from pulsar import Client

client = Client('pulsar://pulsar-cluster.acme.com:6650')
```

## Reader interface

In Pulsar, the "standard" [consumer interface](#consumers) involves using consumers to listen on topics, process incoming messages, and finally acknowledge those messages when they've been processed. Whenever a consumer connects to a topic, it automatically begins reading from the earliest un-acked message onward because the topic's cursor is automatically managed by Pulsar.

The **reader interface** for Pulsar enables applications to manually manage cursors. When you use a reader to connect to a topic---rather than a consumer---you need to specify *which* message the reader begins reading from when it connects to a topic. When connecting to a topic, the reader interface enables you to begin with:

* The **earliest** available message in the topic
* The **latest** available message in the topic
* Some other message between the earliest and the latest. If you select this option, you'll need to explicitly provide a message ID. Your application will be responsible for "knowing" this message ID in advance, perhaps fetching it from a persistent data store or cache.

The reader interface is helpful for use cases like using Pulsar to provide [effectively-once](https://streaml.io/blog/exactly-once/) processing semantics for a stream processing system. For this use case, it's essential that the stream processing system be able to "rewind" topics to a specific message and begin reading there. The reader interface provides Pulsar clients with the low-level abstraction necessary to "manually position" themselves within a topic.

![The Pulsar consumer and reader interfaces](/docs/assets/pulsar-reader-consumer-interfaces.png)

> ### Non-partitioned topics only
> The reader interface for Pulsar cannot currently be used with [partitioned topics](#partitioned-topics).
Here's a Java example that begins reading from the earliest available message on a topic:

```java
import org.apache.pulsar.client.api.Message;
import org.apache.pulsar.client.api.MessageId;
import org.apache.pulsar.client.api.Reader;

// Create a reader on a topic and for a specific message (and onward)
Reader reader = pulsarClient.newReader()
    .topic("reader-api-test")
    .startMessageId(MessageId.earliest)
    .create();

while (true) {
    Message message = reader.readNext();

    // Process the message
}
```

To create a reader that will read from the latest available message:

```java
Reader reader = pulsarClient.newReader()
    .topic(topic)
    .startMessageId(MessageId.latest)
    .create();
```

To create a reader that will read from some message between earliest and latest:

```java
byte[] msgIdBytes = // Some byte array
MessageId id = MessageId.fromByteArray(msgIdBytes);
Reader reader = pulsarClient.newReader()
    .topic(topic)
    .startMessageId(id)
    .create();
```

## Topic compaction

Pulsar was built with highly scalable [persistent storage](#persistent-storage) of message data as a primary objective. Pulsar topics enable you to persistently store as many unacknowledged messages as you need while preserving message ordering. By default, Pulsar stores *all* unacknowledged/unprocessed messages produced on a topic. Accumulating many unacknowledged messages on a topic is necessary for many Pulsar use cases, but it can also be very time-intensive for Pulsar consumers to "rewind" through the entire log of messages.

> For a more practical guide to topic compaction, see the [Topic compaction cookbook](cookbooks-compaction.md).

For some use cases consumers don't need a complete "image" of the topic log. They may only need a few values to construct a more "shallow" image of the log, perhaps even just the most recent value. For these kinds of use cases Pulsar offers **topic compaction**. When you run compaction on a topic, Pulsar goes through a topic's backlog and removes messages that are *obscured* by later messages, i.e. it goes through the topic on a per-key basis and leaves only the most recent message associated with that key.

Pulsar's topic compaction feature:

* Allows for faster "rewind" through topic logs
* Applies only to [persistent topics](#persistent-storage)
* Is triggered automatically when the backlog reaches a certain size, or manually via the command line. See the [Topic compaction cookbook](cookbooks-compaction.md)
* Is conceptually and operationally distinct from [retention and expiry](#message-retention-and-expiry). Topic compaction *does*, however, respect retention. If retention has removed a message from the message backlog of a topic, the message will also not be readable from the compacted topic ledger.

> #### Topic compaction example: the stock ticker
> An example use case for a compacted Pulsar topic would be a stock ticker topic. On a stock ticker topic, each message bears a timestamped dollar value for stocks for purchase (with the message key holding the stock symbol, e.g. `AAPL` or `GOOG`). With a stock ticker you may care only about the most recent value(s) of the stock and have no interest in historical data (i.e. you don't need to construct a complete image of the topic's sequence of messages per key). Compaction would be highly beneficial in this case because it would keep consumers from needing to rewind through obscured messages.
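Reading the compacted view of a topic is a per-consumer opt-in. A sketch of what that looks like with the Java client, assuming the `readCompacted` flag on the consumer builder and an existing `PulsarClient` named `client` (the topic and subscription names are illustrative):

```java
// Consume the compacted stock-ticker topic: only the latest message per key
// (plus any messages newer than the compaction horizon) will be delivered
Consumer compactedTopicConsumer = client.newConsumer()
        .topic("persistent://public/default/stock-ticker")
        .subscriptionName("stock-ticker-subscription")
        // Note: reading compacted topics requires an exclusive or failover subscription
        .readCompacted(true)
        .subscribe();
```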
### How topic compaction works

When topic compaction is triggered [via the CLI](cookbooks-compaction.md), Pulsar will iterate over the entire topic from beginning to end. For each key that it encounters, the compaction routine will keep a record of the latest occurrence of that key.

After that, the broker will create a new [BookKeeper ledger](#ledgers) and make a second iteration through each message on the topic. For each message, if the key matches the latest occurrence of that key, then the key's data payload, message ID, and metadata will be written to the newly created ledger. If the message isn't the latest occurrence of its key, it will be skipped and left alone. If any given message has an empty payload, it will be skipped and considered deleted (akin to the concept of [tombstones](https://en.wikipedia.org/wiki/Tombstone_(data_store)) in key-value databases). At the end of this second iteration through the topic, the newly created BookKeeper ledger is closed and two things are written to the topic's metadata: the ID of the BookKeeper ledger and the message ID of the last compacted message (this is known as the **compaction horizon** of the topic). Once this metadata is written, compaction is complete.

After the initial compaction operation, the Pulsar broker that owns the topic is notified whenever any future changes are made to the compaction horizon and compacted backlog. When such changes occur:

* Clients (consumers and readers) that have `readCompacted` enabled will attempt to read messages from the topic and either:
  * Read from the topic as normal (if the message ID is greater than or equal to the compaction horizon) or
  * Read beginning at the compaction horizon (if the message ID is lower than the compaction horizon)

## Tiered Storage

Pulsar's segment-oriented architecture allows for topic backlogs to grow very large, effectively without limit. However, this can become expensive over time.

One way to alleviate this cost is to use Tiered Storage. With tiered storage, older messages in the backlog can be moved from BookKeeper to a cheaper storage mechanism, while still allowing clients to access the backlog as if nothing had changed.

![Tiered Storage](/docs/assets/pulsar-tiered-storage.png)

> Data written to BookKeeper is replicated to 3 physical machines by default. However, once a segment is sealed in BookKeeper it becomes immutable and can be copied to long-term storage. Long-term storage can achieve cost savings by using mechanisms such as [Reed-Solomon error correction](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction) to require fewer physical copies of data.

Pulsar currently supports S3 as a long-term store. Offloading to S3 is triggered via a REST API or the command-line interface. The user passes in the amount of topic data they wish to retain on BookKeeper, and the broker will copy the backlog data to S3. The original data will then be deleted from BookKeeper after a configured delay (4 hours by default).

> For a guide for setting up tiered storage, see the [Tiered storage cookbook](cookbooks-tiered-storage.md).


## Pulsar IO

Messaging systems are most powerful when you can easily use them in conjunction with external systems like databases and other messaging systems.
**Pulsar IO** is a feature of Pulsar that enables you to easily create, deploy, and manage Pulsar **connectors** that interact with external systems, such as [Apache Cassandra](https://cassandra.apache.org), [Aerospike](https://www.aerospike.com), and many others. + +> #### Pulsar IO and Pulsar Functions +> Under the hood, Pulsar IO connectors are specialized [Pulsar Functions](#pulsar-functions) purpose-built to interface with external systems. The [administrative interface](io-overview.md) for Pulsar IO is, in fact, quite similar to that of Pulsar Functions. + +### Sources and sinks + +Pulsar IO connectors come in two types: + +* **Sources** feed data *into* Pulsar from other systems. Common sources include other messaging systems and "firehose"-style data pipeline APIs. +* **Sinks** are fed data *from* Pulsar. Common sinks include other messaging systems and SQL and NoSQL databases. + +This diagram illustrates the relationship between sources, sinks, and Pulsar: + +![Pulsar IO diagram](/docs/assets/pulsar-io.png "Pulsar IO connectors (sources and sinks)") + +### Working with connectors + +Pulsar IO connectors can be managed via the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool, in particular the [`source`](reference-pulsar-admin.md#source) and [`sink`](reference-pulsar-admin.md#cooks) commands. + +> For a guide to managing connectors in your Pulsar installation, see the [Pulsar IO cookbook](io-overview.md#managing-connectors). + +The following connectors are currently available for Pulsar: + +|Name|Java Class| +|---|---| +|[Aerospike sink](https://www.aerospike.com/)|[`org.apache.pulsar.io.aerospike.AerospikeSink`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/aerospike/src/main/java/org/apache/pulsar/io/aerospike/AerospikeAbstractSink.java)| +|[Cassandra sink](https://cassandra.apache.org)|[`org.apache.pulsar.io.cassandra.CassandraSink`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/cassandra/src/main/java/org/apache/pulsar/io/cassandra/CassandraAbstractSink.java)| +|[Kafka source](https://kafka.apache.org)|[`org.apache.pulsar.io.kafka.KafkaSource`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java)| +|[Kafka sink](https://kafka.apache.org)|[`org.apache.pulsar.io.kafka.KafkaSink`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSink.java)| +|[RabbitMQ source](https://www.rabbitmq.com)|[`org.apache.pulsar.io.rabbitmq.RabbitMQSource`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/rabbitmq/src/main/java/org/apache/pulsar/io/rabbitmq/RabbitMQSource.java)| +|[Twitter Firehose source](https://developer.twitter.com/en/docs)|[org.apache.pulsar.io.twitter.TwitterFireHose](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/twitter/src/main/java/org/apache/pulsar/io/twitter/TwitterFireHose.java)| + + +## Schema registry + +Type safety is extremely important in any application built around a message bus like Pulsar. Producers and consumers need some kind of mechanism for coordinating types at the topic level lest a wide variety of potential problems arise (for example serialization and deserialization issues). Applications typically adopt one of two basic approaches to type safety in messaging: + +1. 
A "client-side" approach in which message producers and consumers are responsible for not only serializing and deserializing messages (which consist of raw bytes) but also "knowing" which types are being transmitted via which topics. If a producer is sending temperature sensor data on the topic `topic-1`, consumers of that topic will run into trouble if they attempt to parse that data as, say, moisture sensor readings. +1. A "server-side" approach in which producers and consumers inform the system which data types can be transmitted via the topic. With this approach, the messaging system enforces type safety and ensures that producers and consumers remain synced. + +Both approaches are available in Pulsar, and you're free to adopt one or the other or to mix and match on a per-topic basis. + +1. For the "client-side" approach, producers and consumers can send and receive messages consisting of raw byte arrays and leave all type safety enforcement to the application on an "out-of-band" basis. +1. For the "server-side" approach, Pulsar has a built-in **schema registry** that enables clients to upload data schemas on a per-topic basis. Those schemas dictate which data types are recognized as valid for that topic. + +> The Pulsar schema registry is currently available only for the [Java client](client-libraries-java.md). + +### Basic architecture + +In Pulsar, schemas are uploaded to, fetched from, and update via Pulsar's [REST API](reference-rest-api.md). + +> #### Other schema registry backends +> Out of the box, Pulsar uses the [Apache BookKeeper](#persistent-storage) log storage system for schema storage. You can, however, use different backends if you wish. Documentation for custom schema storage logic is coming soon. + +### How schemas work + +Pulsar schemas are applied and enforced *at the topic level* (schemas cannot be applied at the namespace or tenant level). Producers and consumers upload schemas to Pulsar brokers. + +Pulsar schemas are fairly simple data structures that consist of: + +* A **name**. In Pulsar, a schema's name is the topic to which the schema is applied. +* A **payload**, which is a binary representation of the schema +* A schema [**type**](#schema-types) +* User-defined **properties** as a string/string map. Usage of properties is wholly application specific. Possible properties might be the Git hash associated with a schema, an environment like `dev` or `prod`, etc. + +### Schema versions + +In order to illustrate how schema versioning works, let's walk through an example. Imagine that the Pulsar [Java client](client-libraries-java.md) created using the code below attempts to connect to Pulsar and begin sending messages: + +```java +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +Producer producer = client.newProducer(JSONSchema.of(SensorReading.class)) + .topic("sensor-data") + .sendTimeout(3, TimeUnit.SECONDS) + .create(); +``` + +The table below lists the possible scenarios when this connection attempt occurs and what will happen in light of each scenario: + +Scenario | What happens +:--------|:------------ +No schema exists for the topic | The producer is created using the given schema. The schema is transmitted to the broker and stored (since no existing schema is "compatible" with the `SensorReading` schema). Any consumer created using the same schema/topic can consume messages from the `sensor-data` topic. 
+A schema already exists; the producer connects using the same schema that's already stored | The schema is transmitted to the Pulsar broker. The broker determines that the schema is compatible. The broker attempts to store the schema in [BookKeeper](#persistent-storage) but then determines that it's already stored, so it's then used to tag produced messages.
+A schema already exists; the producer connects using a new schema that is compatible | The producer transmits the schema to the broker. The broker determines that the schema is compatible and stores the new schema as the current version (with a new version number).
+
+> Schemas are versioned in succession. Schema storage happens in the broker that handles the associated topic so that version assignments can be made. Once a version is assigned to or fetched for a schema, all subsequent messages produced by that producer are tagged with the appropriate version.
+
+
+### Supported schema formats
+
+The following formats are supported by the Pulsar schema registry:
+
+* None. If no schema is specified for a topic, producers and consumers will handle raw bytes.
+* `String` (used for UTF-8-encoded strings)
+* [JSON](https://www.json.org/)
+
+For usage instructions, see the documentation for your preferred client library:
+
+* [Java](client-libraries-java.md#schemas)
+
+> Support for other schema formats will be added in future releases of Pulsar.
diff --git a/site2/docs/getting-started-docker.md b/site2/docs/getting-started-docker.md
new file mode 100644
index 0000000000000000000000000000000000000000..e53b9095ed3bc5c0128588e05e6a41638c9174ad
--- /dev/null
+++ b/site2/docs/getting-started-docker.md
@@ -0,0 +1,155 @@
+---
+id: standalone-docker
+title: Start a standalone cluster with Docker
+sidebar_label: Pulsar in Docker
+---
+
+For the purposes of local development and testing, you can run Pulsar in standalone
+mode on your own machine within a Docker container.
+
+If you don't have Docker installed, you can download the [Community edition](https://www.docker.com/community-edition)
+and follow the instructions for your OS.
+
+## Starting Pulsar inside Docker
+
+```shell
+$ docker run -it \
+  -p 6650:6650 \
+  -p 8080:8080 \
+  -v $PWD/data:/pulsar/data \
+  apachepulsar/pulsar:{{site.current_version}} \
+  bin/pulsar standalone
+```
+
+A few things to note about this command:
+ * `-v $PWD/data:/pulsar/data`: This mounts a local directory into the container so that the
+   process inside the container stores its data and metadata outside the container, rather than
+   starting "fresh" every time the container is restarted.
+
+If Pulsar has been successfully started, you should see `INFO`-level log messages like this:
+
+```
+2017-08-09 22:34:04,030 - INFO - [main:WebService@213] - Web Service started at http://127.0.0.1:8080
+2017-08-09 22:34:04,038 - INFO - [main:PulsarService@335] - messaging service is ready, bootstrap service on port=8080, broker url=pulsar://127.0.0.1:6650, cluster=standalone, configs=org.apache.pulsar.broker.ServiceConfiguration@4db60246
+...
+```
+
+
+> #### Automatically created namespace
+> When you start a local standalone cluster, Pulsar will automatically create a `public/default`
+namespace that you can use for development purposes. All Pulsar topics are managed within namespaces.
+For more info, see [Topics](getting-started-concepts-and-architecture.md#topics).
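+
+You can verify that the namespace exists by running `pulsar-admin` inside the container. This is a minimal sketch, assuming the standalone container started above is still running; `<container-id>` is a placeholder for whatever ID `docker ps` reports:
+
+```shell
+$ docker exec -it <container-id> bin/pulsar-admin namespaces list public
+```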
+ + +## Start publishing and consuming messages + +Pulsar currently offers client libraries for [Java](client-libraries-java.md), [Python](client-libraries-python.md), +and [C++](client-libraries-cpp.md). If you're running a local standalone cluster, you can +use one of these root URLs for interacting with your cluster: + +* `pulsar://localhost:6650` +* `http://localhost:8080` + +Here's an example that lets you quickly get started with Pulsar by using the [Python](client-libraries-python.md) +client API. + +You can install the Pulsar Python client library directly from [PyPI](https://pypi.org/project/pulsar-client/): + +```shell +$ pip install pulsar-client +``` + +First create a consumer and subscribe to the topic: + +```python +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +consumer = client.subscribe('my-topic', + subscription_name='my-sub') + +while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + +client.close() +``` + +Now we can start a producer to send some test messages: + +```python +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +producer = client.create_producer('my-topic') + +for i in range(10): + producer.send(('hello-pulsar-%d' % i).encode('utf-8')) + +client.close() +``` + + +## Get the topic statistics + +In Pulsar you can use REST, Java, or command-line tools to control every aspect of the system. +You can find detailed documentation of all the APIs in the [Admin API Overview](admin-api-overview.md). + +In the simplest example, you can use curl to probe the stats for a particular topic: + +```shell +$ curl http://localhost:8080/admin/v2/persistent/public/default/my-topic/stats | python -m json.tool +``` + +The output will be something like this: + +```json +{ + "averageMsgSize": 0.0, + "msgRateIn": 0.0, + "msgRateOut": 0.0, + "msgThroughputIn": 0.0, + "msgThroughputOut": 0.0, + "publishers": [ + { + "address": "/172.17.0.1:35048", + "averageMsgSize": 0.0, + "clientVersion": "1.19.0-incubating", + "connectedSince": "2017-08-09 20:59:34.621+0000", + "msgRateIn": 0.0, + "msgThroughputIn": 0.0, + "producerId": 0, + "producerName": "standalone-0-1" + } + ], + "replication": {}, + "storageSize": 16, + "subscriptions": { + "my-sub": { + "blockedSubscriptionOnUnackedMsgs": false, + "consumers": [ + { + "address": "/172.17.0.1:35064", + "availablePermits": 996, + "blockedConsumerOnUnackedMsgs": false, + "clientVersion": "1.19.0-incubating", + "connectedSince": "2017-08-09 21:05:39.222+0000", + "consumerName": "166111", + "msgRateOut": 0.0, + "msgRateRedeliver": 0.0, + "msgThroughputOut": 0.0, + "unackedMessages": 0 + } + ], + "msgBacklog": 0, + "msgRateExpired": 0.0, + "msgRateOut": 0.0, + "msgRateRedeliver": 0.0, + "msgThroughputOut": 0.0, + "type": "Exclusive", + "unackedMessages": 0 + } + } +} +``` diff --git a/site2/docs/getting-started-pulsar.md b/site2/docs/getting-started-pulsar.md new file mode 100644 index 0000000000000000000000000000000000000000..0bb2299f6d23cdc0a77bb84c5649c4aaf384cc0f --- /dev/null +++ b/site2/docs/getting-started-pulsar.md @@ -0,0 +1,65 @@ +--- +id: pulsar-2.0 +title: Pulsar 2.0 +sidebar_label: Pulsar 2.0 +--- + +Pulsar 2.0 is a major new release for Pulsar that brings some bold changes to the platform, including [simplified topic names](#topic-names), the addition of the [Pulsar Functions](functions-overview.md) feature, some terminology changes, and more. 
+
+## New features in Pulsar 2.0
+
+Feature | Description
+:-------|:-----------
+[Pulsar Functions](functions-overview.md) | A lightweight compute option for Pulsar
+
+## Major changes
+
+There are a few major changes that you should be aware of, as they may significantly impact your day-to-day usage.
+
+### Properties versus tenants
+
+Previously, Pulsar had a concept of properties. A property is essentially identical to a tenant, so the "property" terminology has been removed in version 2.0. The [`pulsar-admin properties`](reference-pulsar-admin.md#pulsar-admin) command-line interface, for example, has been replaced with the [`pulsar-admin tenants`](reference-pulsar-admin.md#pulsar-admin-tenants) interface. In some cases the properties terminology is still used, but it is now considered deprecated and will be removed entirely in a future release.
+
+### Topic names
+
+Prior to version 2.0, *all* Pulsar topics had the following form:
+
+```http
+{persistent|non-persistent}://property/cluster/namespace/topic
+```
+
+Three important changes have been made in Pulsar 2.0:
+
+* There is no longer a [cluster component](#no-cluster-component)
+* Properties have been [renamed to tenants](#properties-versus-tenants)
+* You can use a [flexible](#flexible-topic-naming) naming system to shorten many topic names
+
+#### No cluster component
+
+The cluster component has been removed from topic names. Thus, all topic names now have the following form:
+
+```http
+{persistent|non-persistent}://tenant/namespace/topic
+```
+
+> Existing topics that use the legacy name format will continue to work without any change, and there are no plans to change that.
+
+
+#### Flexible topic naming
+
+All topic names in Pulsar 2.0 internally have the form shown [above](#no-cluster-component) but you can now use shorthand names in many cases (for the sake of simplicity). The flexible naming system stems from the fact that there is now a default topic type, tenant, and namespace:
+
+Topic aspect | Default
+:------------|:-------
+topic type | `persistent`
+tenant | `public`
+namespace | `default`
+
+The table below shows some example topic name translations that use implicit defaults:
+
+Input topic name | Translated topic name
+:----------------|:---------------------
+`my-topic` | `persistent://public/default/my-topic`
+`my-tenant/my-namespace/my-topic` | `persistent://my-tenant/my-namespace/my-topic`
+
+> For [non-persistent topics](getting-started-concepts-and-architecture.md#non-persistent-topics) you'll need to continue to specify the entire topic name, as the default-based rules for persistent topic names don't apply. Thus you cannot use a shorthand name like `non-persistent://my-topic` and would need to use `non-persistent://public/default/my-topic` instead.
+
diff --git a/site2/docs/getting-started-standalone.md b/site2/docs/getting-started-standalone.md
new file mode 100644
index 0000000000000000000000000000000000000000..523ae18fbc7b67b624968f4dd9186d678c00eeea
--- /dev/null
+++ b/site2/docs/getting-started-standalone.md
@@ -0,0 +1,136 @@
+---
+id: standalone
+title: Setting up a local standalone cluster
+sidebar_label: Run Pulsar locally
+---
+
+For the purposes of local development and testing, you can run Pulsar in standalone mode on your own machine. Standalone mode includes a Pulsar broker as well as the necessary ZooKeeper and BookKeeper components running inside of a single Java Virtual Machine (JVM) process.
+
+> #### Pulsar in production?
+> If you're looking to run a full production Pulsar installation, see the [Deploying a Pulsar instance](deploy-bare-metal.md) guide. + +## System requirements + +Pulsar is currently available for **MacOS** and **Linux**. In order to use Pulsar, you'll need to install [Java 8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html). + + +## Installing Pulsar + +To get started running Pulsar, download a binary tarball release in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar pulsar:version binary release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/incubator-pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget pulsar:binary_release_url + ``` + +Once the tarball is downloaded, untar it and `cd` into the resulting directory: + +```bash +$ tar xvfz apache-pulsar-pulsar:version-bin.tar.gz +$ cd apache-pulsar-pulsar:version +``` + +## What your package contains + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | Pulsar's command-line tools, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](reference-pulsar-admin.md) +`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more +`examples` | A Java JAR file containing example [Pulsar Functions](functions-overview.md) +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar +`licenses` | License files, in `.txt` form, for various components of the Pulsar [codebase](developing-codebase.md) + +These directories will be created once you begin running Pulsar: + +Directory | Contains +:---------|:-------- +`data` | The data storage directory used by ZooKeeper and BookKeeper +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md) +`logs` | Logs created by the installation + + + + +## Starting the cluster + +Once you have an up-to-date local copy of the release, you can start up a local cluster using the [`pulsar`](reference-cli-tools.md#pulsar) command, which is stored in the `bin` directory, and specifying that you want to start up Pulsar in standalone mode: + +```bash +$ bin/pulsar standalone +``` + +If Pulsar has been successfully started, you should see `INFO`-level log messages like this: + +```bash +2017-06-01 14:46:29,192 - INFO - [main:WebSocketService@95] - Global Zookeeper cache started +2017-06-01 14:46:29,192 - INFO - [main:AuthenticationService@61] - Authentication is disabled +2017-06-01 14:46:29,192 - INFO - [main:WebSocketService@108] - Pulsar WebSocket Service started +``` + +> #### Automatically created namespace +> When you start a local standalone cluster, Pulsar will automatically create a `public/default` [namespace](getting-started-concepts-and-architecture.md#namespace) that you can use for development purposes. All Pulsar topics are managed within namespaces. For more info, see [Topics](getting-started-concepts-and-architecture.md#topics). + + +## Testing your cluster setup + +Pulsar provides a CLI tool called [`pulsar-client`](reference-cli-tools.md#pulsar-client) that enables you to do things like send messages to a Pulsar topic in a running cluster. 
This command will send a simple message saying `hello-pulsar` to the `my-topic` topic:
+
+```bash
+$ bin/pulsar-client produce my-topic --messages "hello-pulsar"
+```
+
+If the message has been successfully published to the topic, you should see a confirmation like this in the `pulsar-client` logs:
+
+```
+13:09:39.356 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully produced
+```
+
+
+> #### No need to explicitly create new topics
+> You may have noticed that we did not explicitly create the `my-topic` topic to which we sent the `hello-pulsar` message. If you attempt to write a message to a topic that does not yet exist, Pulsar will automatically create that topic for you.
+
+## Using Pulsar clients locally
+
+Pulsar currently offers client libraries for [Java](client-libraries-java.md), [Python](client-libraries-python.md), and [C++](client-libraries-cpp.md). If you're running a local standalone cluster, you can use one of these root URLs for interacting with your cluster:
+
+* `http://localhost:8080`
+* `pulsar://localhost:6650`
+
+Here's an example producer for a Pulsar topic using the [Java](client-libraries-java.md) client:
+
+```java
+String localClusterUrl = "pulsar://localhost:6650";
+
+PulsarClient client = PulsarClient.builder().serviceUrl(localClusterUrl).build();
+Producer producer = client.newProducer().topic("my-topic").create();
+```
+
+Here's an example [Python](client-libraries-python.md) producer:
+
+```python
+import pulsar
+
+client = pulsar.Client('pulsar://localhost:6650')
+producer = client.create_producer('my-topic')
+```
+
+Finally, here's an example [C++](client-libraries-cpp.md) producer:
+
+```cpp
+Client client("pulsar://localhost:6650");
+Producer producer;
+Result result = client.createProducer("my-topic", producer);
+if (result != ResultOk) {
+    LOG_ERROR("Error creating producer: " << result);
+    return -1;
+}
+```
diff --git a/site2/docs/io-overview.md b/site2/docs/io-overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..781fb0e0ca4f3310146d1fa0c597032d0101279b
--- /dev/null
+++ b/site2/docs/io-overview.md
@@ -0,0 +1,38 @@
+---
+id: io-overview
+title: Pulsar IO Overview
+sidebar_label: Overview
+---
+
+Messaging systems are most powerful when you can easily use them in conjunction with external systems like databases and other messaging systems. **Pulsar IO** is a feature of Pulsar that enables you to easily create, deploy, and manage Pulsar **connectors** that interact with external systems, such as [Apache Cassandra](https://cassandra.apache.org), [Aerospike](https://www.aerospike.com), and many others.
+
+> #### Pulsar IO and Pulsar Functions
+> Under the hood, Pulsar IO connectors are specialized [Pulsar Functions](functions-overview.md) purpose-built to interface with external systems. The [administrative interface](io-quickstart.md) for Pulsar IO is, in fact, quite similar to that of Pulsar Functions.
+
+## Sources and sinks
+
+Pulsar IO connectors come in two types:
+
+* **Sources** feed data *into* Pulsar from other systems. Common sources include other messaging systems and "firehose"-style data pipeline APIs.
+* **Sinks** are fed data *from* Pulsar. Common sinks include other messaging systems and SQL and NoSQL databases.
+
+This diagram illustrates the relationship between sources, sinks, and Pulsar:
+
+![Pulsar IO diagram](/docs/assets/pulsar-io.png "Pulsar IO connectors (sources and sinks)")
+
+## Working with connectors
+
+Pulsar IO connectors can be managed via the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool, in particular the [`source`](reference-pulsar-admin.md#source) and [`sink`](reference-pulsar-admin.md#sink) commands.
+
+> For a guide to managing connectors in your Pulsar installation, see [Getting started with Pulsar IO](io-quickstart.md).
+
+The following connectors are currently available for Pulsar:
+
+|Name|Java Class|
+|---|---|
+|[Aerospike sink](https://www.aerospike.com/)|[`org.apache.pulsar.io.aerospike.AerospikeSink`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/aerospike/src/main/java/org/apache/pulsar/io/aerospike/AerospikeStringSink.java)|
+|[Cassandra sink](https://cassandra.apache.org)|[`org.apache.pulsar.io.cassandra.CassandraSink`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/cassandra/src/main/java/org/apache/pulsar/io/cassandra/CassandraStringSink.java)|
+|[Kafka source](https://kafka.apache.org)|[`org.apache.pulsar.io.kafka.KafkaSource`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaStringSource.java)|
+|[Kafka sink](https://kafka.apache.org)|[`org.apache.pulsar.io.kafka.KafkaSink`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaStringSink.java)|
+|[RabbitMQ source](https://www.rabbitmq.com)|[`org.apache.pulsar.io.rabbitmq.RabbitMQSource`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/rabbitmq/src/main/java/org/apache/pulsar/io/rabbitmq/RabbitMQSource.java)|
+|[Twitter Firehose source](https://developer.twitter.com/en/docs)|[`org.apache.pulsar.io.twitter.TwitterFireHose`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/twitter/src/main/java/org/apache/pulsar/io/twitter/TwitterFireHose.java)|
diff --git a/site2/docs/io-quickstart.md b/site2/docs/io-quickstart.md
new file mode 100644
index 0000000000000000000000000000000000000000..c270fc1a96ee38c961e97b10a99711ed4b8495cc
--- /dev/null
+++ b/site2/docs/io-quickstart.md
@@ -0,0 +1,72 @@
+---
+id: io-quickstart
+title: Getting started with Pulsar IO
+sidebar_label: Getting started
+---
+
+[Pulsar IO](getting-started-concepts-and-architecture.md#pulsar-io) is a feature of Pulsar that enables you to easily create and manage **connectors** that interface with external systems, such as databases and other messaging systems.
+
+## Setup
+
+In order to run Pulsar IO connectors, you'll need to have a binary distribution of Pulsar locally.
+
+## Managing connectors
+
+Pulsar connectors can be managed using the [`source`](reference-pulsar-admin.md#source) and [`sink`](reference-pulsar-admin.md#sink) commands of the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool.
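+
+As with the other Pulsar CLI tools, passing `--help` to either command should print the available subcommands and flags, which is a quick way to explore them before running the examples below:
+
+```bash
+$ bin/pulsar-admin source --help
+$ bin/pulsar-admin sink --help
+```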
+
+### Running sources
+
+You can submit a source to be run in an existing Pulsar cluster using the [`create`](reference-pulsar-admin.md#source-create) subcommand, with a command of this form:
+
+```bash
+$ bin/pulsar-admin source create --className <classname> --jar <jar-location> --tenant test --namespace <namespace> --name <source-name> --destinationTopicName <output-topic>
+```
+
+Here’s an example command:
+
+```bash
+$ bin/pulsar-admin source create --className org.apache.pulsar.io.twitter.TwitterFireHose --jar ~/application.jar --tenant test --namespace ns1 --name twitter-source --destinationTopicName twitter_data
+```
+
+Instead of submitting a source to run on an existing Pulsar cluster, you can alternatively run a source as a process on your local machine:
+
+```bash
+$ bin/pulsar-admin source localrun --className org.apache.pulsar.io.twitter.TwitterFireHose --jar ~/application.jar --tenant test --namespace ns1 --name twitter-source --destinationTopicName twitter_data
+```
+
+### Running sinks
+
+You can submit a sink to be run in an existing Pulsar cluster using a command of this form:
+
+```bash
+$ bin/pulsar-admin sink create --className <classname> --jar <jar-location> --tenant test --namespace <namespace> --name <sink-name> --inputs <input-topics>
+```
+
+Here’s an example command:
+
+```bash
+$ bin/pulsar-admin sink create --className org.apache.pulsar.io.cassandra.CassandraSink --jar ~/application.jar --tenant test --namespace ns1 --name cassandra-sink --inputs test_topic
+```
+
+Instead of submitting a sink to run on an existing Pulsar cluster, you can alternatively run a sink as a process on your local machine:
+
+```bash
+$ bin/pulsar-admin sink localrun --className org.apache.pulsar.io.cassandra.CassandraSink --jar ~/application.jar --tenant test --namespace ns1 --name cassandra-sink --inputs test_topic
+```
+
+## Available connectors
+
+At the moment, the following connectors are available for Pulsar:
+
+|Name|Java Class|
+|---|---|
+|[Aerospike sink](https://www.aerospike.com/)|[`org.apache.pulsar.io.aerospike.AerospikeSink`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/aerospike/src/main/java/org/apache/pulsar/io/aerospike/AerospikeStringSink.java)|
+|[Cassandra sink](https://cassandra.apache.org)|[`org.apache.pulsar.io.cassandra.CassandraSink`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/cassandra/src/main/java/org/apache/pulsar/io/cassandra/CassandraStringSink.java)|
+|[Kafka source](https://kafka.apache.org)|[`org.apache.pulsar.io.kafka.KafkaSource`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaStringSource.java)|
+|[Kafka sink](https://kafka.apache.org)|[`org.apache.pulsar.io.kafka.KafkaSink`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaStringSink.java)|
+|[RabbitMQ source](https://www.rabbitmq.com)|[`org.apache.pulsar.io.rabbitmq.RabbitMQSource`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/rabbitmq/src/main/java/org/apache/pulsar/io/rabbitmq/RabbitMQSource.java)|
+|[Twitter Firehose source](https://developer.twitter.com/en/docs)|[`org.apache.pulsar.io.twitter.TwitterFireHose`](https://github.com/apache/incubator-pulsar/blob/master/pulsar-io/twitter/src/main/java/org/apache/pulsar/io/twitter/TwitterFireHose.java)|
+
+
diff --git a/site2/docs/reference-auth.md b/site2/docs/reference-auth.md
new file mode 100644
index 0000000000000000000000000000000000000000..08364924af36489620fedfb576ec3fa5eb6291b7
--- /dev/null
+++ b/site2/docs/reference-auth.md
@@ -0,0 +1,206 @@
+---
+id: reference-auth
+title: Extending Authentication and Authorization in Pulsar
+sidebar_label: Authn & Authz plugins
+---
+
+Pulsar provides a way to use custom authentication and authorization mechanisms.
+
+## Authentication
+
+Pulsar supports mutual TLS and Athenz authentication plugins, and these can be used as described
+[here](administration-auth.md).
+
+It is possible to use a custom authentication mechanism by providing the implementation in the
+form of two plugins: one for the client library and the other for the Pulsar broker to validate
+the credentials.
+
+### Client authentication plugin
+
+For the client library, you will need to implement `org.apache.pulsar.client.api.Authentication`. This class can then be passed
+when creating a Pulsar client:
+
+```java
+PulsarClient client = PulsarClient.builder()
+    .serviceUrl("pulsar://localhost:6650")
+    .authentication(new MyAuthentication())
+    .build();
+```
+
+For reference, there are two interfaces to implement on the client side:
+ * `Authentication` -> [Authentication API](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/Authentication.html)
+ * `AuthenticationDataProvider` -> [AuthenticationDataProvider API](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/AuthenticationDataProvider.html)
+
+
+The `Authentication` implementation in turn provides the client credentials in the form of `org.apache.pulsar.client.api.AuthenticationDataProvider`. This leaves
+room to return different kinds of authentication tokens for different
+types of connections, or to pass a certificate chain to use for TLS.
+
+
+Examples for client authentication providers can be found at:
+
+ * Mutual TLS Auth -- https://github.com/apache/incubator-pulsar/tree/master/pulsar-client/src/main/java/org/apache/pulsar/client/impl/auth
+ * Athenz -- https://github.com/apache/incubator-pulsar/tree/master/pulsar-client-auth-athenz/src/main/java/org/apache/pulsar/client/impl/auth
+
+### Broker authentication plugin
+
+On the broker side, we need the corresponding plugin to validate the credentials
+passed by the client. The broker can support multiple authentication providers
+at the same time.
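+
+To make this concrete, here is a minimal sketch of what such a provider might look like. The `my-auth` method name, the use of command data as the credential channel, and the fixed `my-role` mapping are all illustrative assumptions rather than a real Pulsar plugin; the full interface it implements is reproduced below:
+
+```java
+import java.io.IOException;
+
+import javax.naming.AuthenticationException;
+
+import org.apache.pulsar.broker.ServiceConfiguration;
+import org.apache.pulsar.broker.authentication.AuthenticationDataSource;
+import org.apache.pulsar.broker.authentication.AuthenticationProvider;
+
+public class MyAuthenticationProvider implements AuthenticationProvider {
+
+    @Override
+    public void initialize(ServiceConfiguration config) throws IOException {
+        // Load any keys or shared secrets from the broker configuration here
+    }
+
+    @Override
+    public String getAuthMethodName() {
+        // Must match the method name reported by the client-side Authentication plugin
+        return "my-auth";
+    }
+
+    @Override
+    public String authenticate(AuthenticationDataSource authData) throws AuthenticationException {
+        // Assumes the client passes its credentials over the Pulsar binary protocol
+        String token = authData.getCommandData();
+        if (token == null || token.isEmpty()) {
+            throw new AuthenticationException("No credentials passed");
+        }
+        // Validate the token and map it to a role; this fixed mapping is illustrative only
+        return "my-role";
+    }
+
+    @Override
+    public void close() throws IOException {
+        // No resources to release in this sketch
+    }
+}
+```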
+
+In `conf/broker.conf` it's possible to specify a list of valid providers:
+
+```properties
+# Authentication provider name list, which is a comma-separated list of class names
+authenticationProviders=
+```
+
+There is a single interface to implement, `org.apache.pulsar.broker.authentication.AuthenticationProvider`:
+
+```java
+/**
+ * Provider of authentication mechanism
+ */
+public interface AuthenticationProvider extends Closeable {
+
+    /**
+     * Perform initialization for the authentication provider
+     *
+     * @param config
+     *            broker config object
+     * @throws IOException
+     *             if the initialization fails
+     */
+    void initialize(ServiceConfiguration config) throws IOException;
+
+    /**
+     * @return the authentication method name supported by this provider
+     */
+    String getAuthMethodName();
+
+    /**
+     * Validate the authentication for the given credentials with the specified authentication data
+     *
+     * @param authData
+     *            provider specific authentication data
+     * @return the "role" string for the authenticated connection, if the authentication was successful
+     * @throws AuthenticationException
+     *             if the credentials are not valid
+     */
+    String authenticate(AuthenticationDataSource authData) throws AuthenticationException;
+
+}
+```
+
+Examples of broker authentication plugins:
+
+ * Mutual TLS -- https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker-common/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderTls.java
+ * Athenz -- https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker-auth-athenz/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderAthenz.java
+
+## Authorization
+
+Authorization is the operation that checks whether a particular "role" or "principal" is
+allowed to perform a certain operation.
+
+By default, Pulsar provides an embedded authorization provider, though it's possible to
+configure a different one through a plugin.
+
+To provide a custom provider, one needs to implement the
+`org.apache.pulsar.broker.authorization.AuthorizationProvider` interface, put this class on the
+Pulsar broker classpath, and configure it in `conf/broker.conf`:
+
+```properties
+# Authorization provider fully qualified class-name
+authorizationProvider=org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider
+```
+
+```java
+/**
+ * Provider of authorization mechanism
+ */
+public interface AuthorizationProvider extends Closeable {
+
+    /**
+     * Perform initialization for the authorization provider
+     *
+     * @param conf
+     *            broker config object
+     * @param configCache
+     *            pulsar zk configuration cache service
+     * @throws IOException
+     *             if the initialization fails
+     */
+    void initialize(ServiceConfiguration conf, ConfigurationCacheService configCache) throws IOException;
+
+    /**
+     * Check if the specified role has permission to send messages to the specified fully qualified topic name.
+     *
+     * @param topicName
+     *            the fully qualified topic name associated with the topic.
+     * @param role
+     *            the app id used to send messages to the topic.
+     */
+    CompletableFuture<Boolean> canProduceAsync(TopicName topicName, String role,
+            AuthenticationDataSource authenticationData);
+
+    /**
+     * Check if the specified role has permission to receive messages from the specified fully qualified topic name.
+     *
+     * @param topicName
+     *            the fully qualified topic name associated with the topic.
+     * @param role
+     *            the app id used to receive messages from the topic.
+     * @param subscription
+     *            the subscription name defined by the client
+     */
+    CompletableFuture<Boolean> canConsumeAsync(TopicName topicName, String role,
+            AuthenticationDataSource authenticationData, String subscription);
+
+    /**
+     * Check whether the specified role can perform a lookup for the specified topic.
+     *
+     * For that the caller needs to have producer or consumer permission.
+     *
+     * @param topicName
+     * @param role
+     * @return
+     * @throws Exception
+     */
+    CompletableFuture<Boolean> canLookupAsync(TopicName topicName, String role,
+            AuthenticationDataSource authenticationData);
+
+    /**
+     *
+     * Grant authorization-action permission on a namespace to the given client
+     *
+     * @param namespace
+     * @param actions
+     * @param role
+     * @param authDataJson
+     *            additional authdata in json format
+     * @return CompletableFuture
+     * @completesWith
+     *             IllegalArgumentException when namespace not found
+     *             IllegalStateException when failed to grant permission
+     */
+    CompletableFuture<Void> grantPermissionAsync(NamespaceName namespace, Set<AuthAction> actions, String role,
+            String authDataJson);
+
+    /**
+     * Grant authorization-action permission on a topic to the given client
+     *
+     * @param topicName
+     * @param role
+     * @param authDataJson
+     *            additional authdata in json format
+     * @return CompletableFuture
+     * @completesWith
+     *             IllegalArgumentException when namespace not found
+     *             IllegalStateException when failed to grant permission
+     */
+    CompletableFuture<Void> grantPermissionAsync(TopicName topicName, Set<AuthAction> actions, String role,
+            String authDataJson);
+
+}
+
+```
diff --git a/site2/docs/reference-cli-tools.md b/site2/docs/reference-cli-tools.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc3394e675d443a4db0052df8e3dd59c5d319e7f
--- /dev/null
+++ b/site2/docs/reference-cli-tools.md
@@ -0,0 +1,598 @@
+---
+id: reference-cli-tools
+title: Pulsar command-line tools
+sidebar_label: Pulsar CLI tools
+---
+
+Pulsar offers several command-line tools that you can use for managing Pulsar installations, performance testing, using command-line producers and consumers, and more.
+
+All Pulsar command-line tools can be run from the `bin` directory of your [installed Pulsar package](getting-started-standalone.md). The following tools are currently documented:
+
+* [`pulsar`](#pulsar)
+* [`pulsar-client`](#pulsar-client)
+* [`pulsar-daemon`](#pulsar-daemon)
+* [`pulsar-perf`](#pulsar-perf)
+* [`bookkeeper`](#bookkeeper)
+
+> ### Getting help
+> You can get help for any CLI tool, command, or subcommand using the `--help` flag, or `-h` for short. Here's an example:
+> ```shell
+> $ bin/pulsar broker --help
+> ```
+
+## `pulsar`
+
+The `pulsar` tool is used to start Pulsar components, such as bookies and ZooKeeper, in the foreground.
+
+These processes can also be started in the background, using nohup, via the `pulsar-daemon` tool, which has the same command interface as `pulsar`.
+
+Usage:
+```bash
+$ pulsar command
+```
+Commands:
+* `bookie`
+* `broker`
+* `compact-topic`
+* `discovery`
+* `configuration-store`
+* `initialize-cluster-metadata`
+* `proxy`
+* `standalone`
+* `websocket`
+* `zookeeper`
+* `zookeeper-shell`
+
+Example:
+```bash
+$ pulsar broker --conf /path/to/broker.conf
+```
+
+The table below lists the environment variables that you can use to configure the `pulsar` tool.
+
+|Variable|Description|Default|
+|---|---|---|
+|`PULSAR_LOG_CONF`|Log4j configuration file|`conf/log4j2.yaml`|
+|`PULSAR_BROKER_CONF`|Configuration file for broker|`conf/broker.conf`|
+|`PULSAR_BOOKKEEPER_CONF`|Configuration file for bookie|`conf/bookkeeper.conf`|
+|`PULSAR_ZK_CONF`|Configuration file for zookeeper|`conf/zookeeper.conf`|
+|`PULSAR_CONFIGURATION_STORE_CONF`|Configuration file for the configuration store|`conf/global_zookeeper.conf`|
+|`PULSAR_DISCOVERY_CONF`|Configuration file for discovery service|`conf/discovery.conf`|
+|`PULSAR_WEBSOCKET_CONF`|Configuration file for websocket proxy|`conf/websocket.conf`|
+|`PULSAR_STANDALONE_CONF`|Configuration file for standalone|`conf/standalone.conf`|
+|`PULSAR_EXTRA_OPTS`|Extra options to be passed to the JVM||
+|`PULSAR_EXTRA_CLASSPATH`|Extra paths for Pulsar's classpath||
+|`PULSAR_PID_DIR`|Folder where the pulsar server PID file should be stored||
+|`PULSAR_STOP_TIMEOUT`|Wait time before forcefully killing the Bookie server instance if attempts to stop it are not successful||
+
+
+
+### `bookie`
+
+Starts up a bookie server
+
+Usage:
+```bash
+$ pulsar bookie options
+```
+
+Options
+
+|Option|Description|Default|
+|---|---|---|
+|`-c`, `--config`|Configuration for the bookie server||
+|`-readOnly`|Force start a read-only bookie server|false|
+|`-withAutoRecovery`|Start auto-recover service bookie server|false|
+
+
+Example
+```bash
+$ pulsar bookie \
+  -readOnly \
+  -withAutoRecovery \
+  --conf /path/to/bookkeeper.conf
+```
+
+### `broker`
+
+Starts up a Pulsar broker
+
+Usage
+```bash
+$ pulsar broker options
+```
+
+Options
+
+|Option|Description|Default|
+|---|---|---|
+|`-c` , `--broker-conf`|Configuration file for the broker||
+|`-bc` , `--bookie-conf`|Configuration file for BookKeeper||
+|`-rb` , `--run-bookie`|Run a BookKeeper bookie on the same host as the Pulsar broker|false|
+|`-ra` , `--run-bookie-autorecovery`|Run a BookKeeper autorecovery daemon on the same host as the Pulsar broker|false|
+
+Example
+```bash
+$ pulsar broker --broker-conf /path/to/broker.conf
+```
+
+### `compact-topic`
+
+Run compaction against a Pulsar topic (in a new process)
+
+Usage
+```bash
+$ pulsar compact-topic options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`-t` , `--topic`|The Pulsar topic that you would like to compact||
+|`-c` , `--broker-conf`|Configuration file for the broker|${pulsarDirectory}/conf/broker.conf|
+
+Example
+```bash
+$ pulsar compact-topic --topic topic-to-compact
+```
+
+### `discovery`
+
+Run a discovery server
+
+Usage
+```bash
+$ pulsar discovery options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`-c` , `--conf`|Configuration file for the discovery service||
+
+Example
+```bash
+$ pulsar discovery --conf /path/to/discovery.conf
+```
+
+### `configuration-store`
+
+Starts up the Pulsar configuration store
+
+Usage
+```bash
+$ pulsar configuration-store options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`-c` , `--conf`|Configuration file for the configuration store||
+
+
+### `initialize-cluster-metadata`
+
+One-time cluster metadata initialization
+
+Usage
+```bash
+$ pulsar initialize-cluster-metadata options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`-ub` , `--broker-service-url`|The broker service URL for the new cluster||
+|`-tb` , `--broker-service-url-tls`|The broker service URL for the new cluster with TLS encryption||
+|`-c` , `--cluster`|Cluster name||
+|`--configuration-store`|The configuration store quorum connection string||
+|`-uw` , `--web-service-url`|The web service URL for the new cluster||
+|`-tw` , `--web-service-url-tls`|The web service URL for the new cluster with TLS encryption||
+|`-zk` , `--zookeeper`|The local ZooKeeper quorum connection string||
+
+
+### `proxy`
+
+Manages the Pulsar proxy
+
+Usage
+```bash
+$ pulsar proxy options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`-c` , `--config`|Path to a Pulsar proxy configuration file||
+|`--configuration-store`|Configuration store connection string||
+|`-zk` , `--zookeeper-servers`|Local ZooKeeper connection string||
+
+Example
+```bash
+$ pulsar proxy \
+  --zookeeper-servers zk-0,zk-1,zk-2 \
+  --configuration-store zk-0,zk-1,zk-2
+```
+
+### `standalone`
+
+Run a broker service with local bookies and local ZooKeeper
+
+Usage
+```bash
+$ pulsar standalone options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`-a` , `--advertised-address`|The standalone broker advertised address||
+|`--bookkeeper-dir`|Local bookies’ base data directory|data/standalone/bookkeeper|
+|`--bookkeeper-port`|Local bookies’ base port|3181|
+|`-c` , `--config`|Configuration file path||
+|`--no-broker`|Only start ZooKeeper and BookKeeper services, not the broker|false|
+|`--num-bookies`|The number of local bookies|1|
+|`--only-broker`|Only start the Pulsar broker service (not ZooKeeper or BookKeeper)||
+|`--wipe-data`|Clean up previous ZooKeeper/BookKeeper data||
+|`--zookeeper-dir`|Local ZooKeeper’s data directory|data/standalone/zookeeper|
+|`--zookeeper-port` |Local ZooKeeper’s port|2181|
+
+
+### `websocket`
+
+Usage
+```bash
+$ pulsar websocket options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`-c`, `--conf`|Configuration file for WebSocket service||
+
+
+### `zookeeper`
+
+Starts up a ZooKeeper cluster
+
+Usage
+```bash
+$ pulsar zookeeper options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`-c`, `--conf`|Configuration file for ZooKeeper||
+
+
+### `zookeeper-shell`
+
+Connects to a running ZooKeeper cluster using the ZooKeeper shell
+
+Usage
+```bash
+$ pulsar zookeeper-shell options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`-c`, `--conf`|Configuration file for ZooKeeper||
+
+
+
+## `pulsar-client`
+
+The `pulsar-client` tool enables you to send messages to, and consume messages from, Pulsar topics
+
+Usage
+```bash
+$ pulsar-client command
+```
+
+Commands
+* `produce`
+* `consume`
+
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`--auth-params`|Authentication parameters, for example key1:val1,key2:val2||
+|`--auth-plugin`|Authentication plugin class name||
+|`--url`|Broker URL to which to connect|pulsar://localhost:6650/|
+
+
+### `produce`
+Send a message or messages to a specific broker and topic
+
+Usage
+```bash
+$ pulsar-client produce topic options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`-f`, `--files`|Comma-separated file paths to send; either -m or -f must be specified|[]|
+|`-m`, `--messages`|Comma-separated string of messages to send; either -m or -f must be specified|[]|
+|`-n`, `--num-produce`|The number of times to send the message(s); the count of messages/files * num-produce should be below 1000|1|
+|`-r`, `--rate`|Rate (in messages per second) at which to produce; a value 0 means to produce messages as fast as possible|0.0|
+
+
+### `consume`
+Consume messages from a specific broker and topic
+
+Usage
+```bash
+$ pulsar-client consume topic options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`--hex`|Display binary messages in hexadecimal format.|false|
+|`-n`, `--num-messages`|Number
of messages to consume, 0 means to consume forever.|0|
+|`-r`, `--rate`|Rate (in messages per second) at which to consume; a value 0 means to consume messages as fast as possible|0.0|
+|`-s`, `--subscription-name`|Subscription name||
+|`-t`, `--subscription-type`|The type of the subscription. Possible values: Exclusive, Shared, Failover.|Exclusive|
+
+
+
+## `pulsar-daemon`
+A wrapper around the `pulsar` tool that’s used to start and stop processes, such as ZooKeeper, bookies, and Pulsar brokers, in the background using nohup.
+
+`pulsar-daemon` has a similar interface to the `pulsar` command but adds `start` and `stop` commands for various services. For a listing of those services, run `pulsar-daemon` to see the help output, or see the documentation for the `pulsar` command.
+
+Usage
+```bash
+$ pulsar-daemon command
+```
+
+Commands
+* `start`
+* `stop`
+
+
+### `start`
+Start a service in the background using nohup.
+
+Usage
+```bash
+$ pulsar-daemon start service
+```
+
+### `stop`
+Stop a service that’s already been started using start.
+
+Usage
+```bash
+$ pulsar-daemon stop service options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`-force`|Stop the service forcefully if not stopped by normal shutdown.|false|
+
+
+
+## `pulsar-perf`
+A tool for performance testing a Pulsar broker.
+
+Usage
+```bash
+$ pulsar-perf command
+```
+
+Commands
+* `consume`
+* `produce`
+* `monitor-brokers`
+* `simulation-client`
+* `simulation-controller`
+
+Environment variables
+The table below lists the environment variables that you can use to configure the `pulsar-perf` tool.
+
+|Variable|Description|Default|
+|---|---|---|
+|`PULSAR_LOG_CONF`|Log4j configuration file|conf/log4j2.yaml|
+|`PULSAR_CLIENT_CONF`|Configuration file for the client|conf/client.conf|
+|`PULSAR_EXTRA_OPTS`|Extra options to be passed to the JVM||
+|`PULSAR_EXTRA_CLASSPATH`|Extra paths for Pulsar's classpath||
+
+
+### `consume`
+Run a consumer
+
+Usage
+```bash
+$ pulsar-perf consume options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`--auth_params`|Authentication parameters in the form of key1:val1,key2:val2||
+|`--auth_plugin`|Authentication plugin class name||
+|`-b`, `--batch-time-window`|Batch messages in a window of the specified number of milliseconds|1|
+|`-z`, `--compression`|Compress messages’ payload. Possible values are NONE, LZ4, or ZLIB.||
+|`--conf-file`|Configuration file||
+|`-c`, `--max-connections`|Max number of TCP connections to a single broker|0|
+|`-o`, `--max-outstanding`|Max number of outstanding messages|1000|
+|`-m`, `--num-messages`|Number of messages to publish in total. If set to 0, it will keep publishing.|0|
+|`-n`, `--num-producers`|The number of producers (per topic)|1|
+|`-t`, `--num-topic`|The number of topics|1|
+|`-f`, `--payload-file`|Use payload from a file instead of an empty buffer||
+|`-r`, `--rate`|Publish rate msg/s across topics|100|
+|`-u`, `--service-url`|Pulsar service URL||
+|`-s`, `--size`|Message size (in bytes)|1024|
+|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled.|0|
+|`-time`, `--test-duration`|Test duration in secs. If set to 0, it will keep publishing.|0|
+
+
+### `produce`
+Run a producer
+
+Usage
+```bash
+$ pulsar-perf produce options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`--auth_params`|Authentication parameters in the form of key1:val1,key2:val2||
+|`--auth_plugin`|Authentication plugin class name||
+|`-b`, `--batch-time-window`|Batch messages in a window of the specified number of milliseconds|1|
+|`-z`, `--compression`|Compress messages’ payload. Possible values are NONE, LZ4, or ZLIB.||
+|`--conf-file`|Configuration file||
+|`-c`, `--max-connections`|Max number of TCP connections to a single broker|0|
+|`-o`, `--max-outstanding`|Max number of outstanding messages|1000|
+|`-m`, `--num-messages`|Number of messages to publish in total. If set to 0, it will keep publishing.|0|
+|`-n`, `--num-producers`|The number of producers (per topic)|1|
+|`-t`, `--num-topic`|The number of topics|1|
+|`-f`, `--payload-file`|Use payload from a file instead of an empty buffer||
+|`-r`, `--rate`|Publish rate msg/s across topics|100|
+|`-u`, `--service-url`|Pulsar service URL||
+|`-s`, `--size`|Message size (in bytes)|1024|
+|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled.|0|
+|`-time`, `--test-duration`|Test duration in secs. If set to 0, it will keep publishing.|0|
+
+
+
+### `monitor-brokers`
+Continuously receive broker data and/or load reports
+
+Usage
+```bash
+$ pulsar-perf monitor-brokers options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`--connect-string`|A connection string for one or more ZooKeeper servers||
+
+
+### `simulation-client`
+Run a simulation server acting as a Pulsar client. Uses the client configuration specified in `conf/client.conf`.
+
+Usage
+```bash
+$ pulsar-perf simulation-client
+```
+
+
+### `simulation-controller`
+Run a simulation controller to give commands to servers
+
+Usage
+```bash
+$ pulsar-perf simulation-controller options
+```
+
+Options
+
+|Flag|Description|Default|
+|---|---|---|
+|`--client-port`|The port that the clients are listening on|0|
+|`--clients`|Comma-separated list of client hostnames||
+|`--cluster`|The cluster to test on||
+
+
+## `bookkeeper`
+A tool for managing BookKeeper.
+
+Usage
+```bash
+$ bookkeeper command
+```
+
+Commands
+* `auto-recovery`
+* `bookie`
+* `localbookie`
+* `upgrade`
+* `shell`
+
+
+Environment variables
+The table below lists the environment variables that you can use to configure the `bookkeeper` tool.
+ +|Variable|Description|Default| +|---|---|---| +|BOOKIE_LOG_CONF|Log4j configuration file|conf/log4j2.yaml| +|BOOKIE_CONF|BookKeeper configuration file|conf/bk_server.conf| +|BOOKIE_EXTRA_OPTS|Extra options to be passed to the JVM|| +|BOOKIE_EXTRA_CLASSPATH|Extra paths for BookKeeper's classpath|| +|ENTRY_FORMATTER_CLASS|The Java class used to format entries|| +|BOOKIE_PID_DIR|Folder where the BookKeeper server PID file should be stored|| +|BOOKIE_STOP_TIMEOUT|Wait time before forcefully killing the Bookie server instance if attempts to stop it are not successful|| + + +### `auto-recovery` +Runs an auto-recovery service daemon + +Usage +```bash +$ bookkeeper auto-recovery options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery daemon|| + + +### `bookie` +Starts up a BookKeeper server (aka bookie) + +Usage +```bash +$ bookkeeper bookie options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery daemon|| +|-readOnly|Force start a read-only bookie server|false| +|-withAutoRecovery|Start auto-recovery service bookie server|false| + + +### `localbookie` +Runs a test ensemble of N bookies locally + +Usage +```bash +$ bookkeeper localbookie N +``` + +### `upgrade` +Upgrade the bookie’s filesystem + +Usage +```bash +$ bookkeeper upgrade options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery daemon|| +|`-u`, `--upgrade`|Upgrade the bookie’s directories|| + + +### `shell` +Run shell for admin commands. To see a full listing of those commands, run bookkeeper shell without an argument. + +Usage +```bash +$ bookkeeper shell +``` + +Example +```bash +$ bookkeeper shell bookiesanity +``` + diff --git a/site2/docs/reference-configuration.md b/site2/docs/reference-configuration.md new file mode 100644 index 0000000000000000000000000000000000000000..b1484f6596822e807ce3d18778d44bdd0e5782f0 --- /dev/null +++ b/site2/docs/reference-configuration.md @@ -0,0 +1,467 @@ +--- +id: reference-configuration +title: Pulsar configuration +sidebar_label: Pulsar configuration +--- + + + + +Pulsar configuration can be managed either via a series of configuration files contained in the [`conf`](https://github.com/apache/incubator-pulsar/tree/master/conf) directory of a Pulsar [installation](getting-started-standalone.md) + +* [BookKeeper](#bookkeeper) +* [Broker](#broker) +* [Client](#client) +* [Service discovery](#service-discovery) +* [Log4j](#log4j) +* [Log4j shell](#log4j-shell) +* [Standalone](#standalone) +* [WebSocket](#websocket) +* [ZooKeeper](#zookeeper) + +## BookKeeper + +BookKeeper is a replicated log storage system that Pulsar uses for durable storage of all messages. + + +|Name|Description|Default| +|---|---|---| +|bookiePort|The port on which the bookie server listens.|3181| +|allowLoopback|Whether the bookie is allowed to use a loopback interface as its primary interface (i.e. the interface used to establish its identity). By default, loopback interfaces are not allowed as the primary interface. Using a loopback interface as the primary interface usually indicates a configuration error. For example, it’s fairly common in some VPS setups to not configure a hostname or to have the hostname resolve to `127.0.0.1`. If this is the case, then all bookies in the cluster will establish their identities as `127.0.0.1:3181` and only one will be able to join the cluster. 
For VPSs configured like this, you should explicitly set the listening interface.|false| +|listeningInterface|The network interface on which the bookie listens. If not set, the bookie will listen on all interfaces.|eth0| +|journalDirectory|The directory where Bookkeeper outputs its write-ahead log (WAL)|data/bookkeeper/journal| +|ledgerDirectories|The directory where Bookkeeper outputs ledger snapshots. This could define multiple directories to store snapshots separated by comma, for example `ledgerDirectories=/tmp/bk1-data,/tmp/bk2-data`. Ideally, ledger dirs and the journal dir are each in a different device, which reduces the contention between random I/O and sequential write. It is possible to run with a single disk, but performance will be significantly lower.|data/bookkeeper/ledgers| +|ledgerManagerType|The type of ledger manager used to manage how ledgers are stored, managed, and garbage collected. See [BookKeeper Internals](http://bookkeeper.apache.org/docs/latest/getting-started/concepts) for more info.|hierarchical| +|zkLedgersRootPath|The root ZooKeeper path used to store ledger metadata. This parameter is used by the ZooKeeper-based ledger manager as a root znode to store all ledgers.|/ledgers| +|ledgerStorageClass|Ledger storage implementation class|org.apache.bookkeeper.bookie.storage.ldb.DbLedgerStorage| +|entryLogFilePreallocationEnabled|Enable or disable entry logger preallocation|true| +|logSizeLimit|Max file size of the entry logger, in bytes. A new entry log file will be created when the old one reaches the file size limitation.|2147483648| +|minorCompactionThreshold|Threshold of minor compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a minor compaction. If set to less than zero, the minor compaction is disabled.|0.2| +|minorCompactionInterval|Time interval to run minor compaction, in seconds. If set to less than zero, the minor compaction is disabled.|3600| +|majorCompactionThreshold|The threshold of major compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a major compaction. Those entry log files whose remaining size percentage is still higher than the threshold will never be compacted. If set to less than zero, the minor compaction is disabled.|0.5| +|majorCompactionInterval|The time interval to run major compaction, in seconds. If set to less than zero, the major compaction is disabled.|86400| +|compactionMaxOutstandingRequests|Sets the maximum number of entries that can be compacted without flushing. When compacting, the entries are written to the entrylog and the new offsets are cached in memory. Once the entrylog is flushed the index is updated with the new offsets. This parameter controls the number of entries added to the entrylog before a flush is forced. A higher value for this parameter means more memory will be used for offsets. Each offset consists of 3 longs. This parameter should not be modified unless you’re fully aware of the consequences.|100000| +|compactionRate|The rate at which compaction will read entries, in adds per second.|1000| +|isThrottleByBytes|Throttle compaction by bytes or by entries.|false| +|compactionRateByEntries|The rate at which compaction will read entries, in adds per second.|1000| +|compactionRateByBytes|Set the rate at which compaction will readd entries. The unit is bytes added per second.|1000000| +|journalMaxSizeMB|Max file size of journal file, in megabytes. 
+|journalMaxBackups|The max number of old journal files to keep. Keeping a number of old journal files can help data recovery in special cases.|5|
+|journalPreAllocSizeMB|How much space to pre-allocate at a time in the journal, in megabytes.|16|
+|journalWriteBufferSizeKB|The size of the write buffers used for the journal, in kilobytes.|64|
+|journalRemoveFromPageCache|Whether pages should be removed from the page cache after force write.|true|
+|journalAdaptiveGroupWrites|Whether to group journal force writes, which optimizes group commit for higher throughput.|true|
+|journalMaxGroupWaitMSec|The maximum latency to impose on a journal write to achieve grouping.|1|
+|journalAlignmentSize|All journal writes and commits should be aligned to the given size|4096|
+|journalBufferedWritesThreshold|Maximum writes to buffer to achieve grouping|524288|
+|journalFlushWhenQueueEmpty|Whether to flush the journal when the journal queue is empty|false|
+|numJournalCallbackThreads|The number of threads that should handle journal callbacks|8|
+|rereplicationEntryBatchSize|The maximum number of entries to keep in a fragment for re-replication|5000|
+|gcWaitTime|Interval to trigger the next garbage collection, in milliseconds. Since garbage collection runs in the background, too-frequent GC will hurt performance. It is better to use a longer GC interval if there is enough disk capacity.|900000|
+|gcOverreplicatedLedgerWaitTime|Interval to trigger the next garbage collection of overreplicated ledgers, in milliseconds. This should not run very frequently, since the metadata for all the ledgers on the bookie is read from ZooKeeper.|86400000|
+|flushInterval|Interval at which ledger index pages are flushed to disk, in milliseconds. Flushing index files introduces a lot of random disk I/O. If the journal dir and ledger dirs are on different devices, flushing will not affect performance. If they are on the same device, too-frequent flushing degrades performance significantly. You can consider increasing the flush interval for better performance, at the cost of a longer bookie restart time after a failure.|60000|
+|bookieDeathWatchInterval|Interval to watch whether a bookie is dead or not, in milliseconds|1000|
+|zkServers|A list of one or more servers on which ZooKeeper is running. The server list can be comma-separated values, for example: `zkServers=zk1:2181,zk2:2181,zk3:2181`.|localhost:2181|
+|zkTimeout|ZooKeeper client session timeout, in milliseconds. The bookie server will exit if it receives SESSION_EXPIRED because it was partitioned off from ZooKeeper for longer than the session timeout. JVM garbage collection or disk I/O can cause SESSION_EXPIRED; increasing this value can help avoid the issue.|30000|
+|serverTcpNoDelay|This setting is used to enable/disable Nagle’s algorithm, which is a means of improving the efficiency of TCP/IP networks by reducing the number of packets that need to be sent over the network. If you are sending many small messages, such that more than one can fit in a single IP packet, setting serverTcpNoDelay to false to enable Nagle’s algorithm can provide better performance.|true|
+|openFileLimit|Max number of ledger index files that can be open in the bookie server. If the number of ledger index files reaches this limit, the bookie server starts to swap some ledgers from memory to disk. Too-frequent swapping affects performance. You can tune this number to gain performance according to your requirements.|0|
+|pageSize|Size of an index page in the ledger cache, in bytes. A larger index page can improve performance when writing pages to disk, which is efficient when you have a small number of ledgers with a similar number of entries. If you have a large number of ledgers and each ledger has fewer entries, a smaller index page improves memory usage.|8192|
+|pageLimit|How many index pages are provided in the ledger cache. If the number of index pages reaches this limit, the bookie server starts to swap some ledgers from memory to disk. You can increase this value when swapping becomes frequent, but make sure pageLimit*pageSize is not more than the JVM max memory limit, otherwise you will get OutOfMemoryExceptions. In general, increasing pageLimit and using a smaller index page gives better performance when there is a large number of ledgers with few entries each. If pageLimit is -1, the bookie server uses 1/3 of the JVM memory to compute the maximum number of index pages.|0|
+|readOnlyModeEnabled|If all configured ledger directories are full, support only read requests from clients. If `readOnlyModeEnabled=true`, then when all ledger disks are full the bookie is converted to read-only mode and serves only read requests. Otherwise the bookie is shut down.|true|
+|diskUsageThreshold|For each ledger dir, the maximum disk space that can be used. Default is 0.95f, i.e. at most 95% of the disk can be used, after which nothing is written to that partition. If all ledger dir partitions are full, the bookie turns read-only if `readOnlyModeEnabled=true` is set, else it shuts down. Valid values are between 0 and 1 (exclusive).|0.95|
+|diskCheckInterval|Disk check interval, in milliseconds; the interval at which ledger dirs usage is checked.|10000|
+|auditorPeriodicCheckInterval|Interval at which the auditor will do a check of all ledgers in the cluster. By default this runs once a week. The interval is set in seconds. To disable the periodic check completely, set this to 0. Note that periodic checking will put extra load on the cluster, so it should not be run more frequently than once a day.|604800|
+|auditorPeriodicBookieCheckInterval|The interval between auditor bookie checks. The auditor bookie check checks ledger metadata to see which bookies should contain entries for each ledger. If a bookie that should contain entries is unavailable, then the ledger containing that entry is marked for recovery. Setting this to 0 disables the periodic check. Bookie checks will still run when a bookie fails. The interval is specified in seconds.|86400|
+|numAddWorkerThreads|Number of threads that should handle write requests. If zero, writes are handled directly by Netty threads.|0|
+|numReadWorkerThreads|Number of threads that should handle read requests. If zero, reads are handled directly by Netty threads.|8|
+|maxPendingReadRequestsPerThread|If read worker threads are enabled, limit the number of pending requests to prevent the executor queue from growing indefinitely.|2500|
+|readBufferSizeBytes|The number of bytes used as capacity for BufferedReadChannel.|4096|
+|writeBufferSizeBytes|The number of bytes used as capacity for the write buffer|65536|
+|useHostNameAsBookieID|Whether the bookie should use its hostname to register with the coordination service (e.g. the ZooKeeper service). When false, the bookie uses its IP address for the registration.|false|
+|statsProviderClass||org.apache.bookkeeper.stats.prometheus.PrometheusMetricsProvider|
+|prometheusStatsHttpPort||8000|
+|dbStorage_writeCacheMaxSizeMb|Size of the write cache. Memory is allocated from JVM direct memory. The write cache is used to buffer entries before flushing into the entry log. For good performance, it should be big enough to hold a substantial amount of entries between flushes.|512|
+|dbStorage_readAheadCacheMaxSizeMb|Size of the read cache. Memory is allocated from JVM direct memory. This read cache is pre-filled by doing read-ahead whenever a cache miss happens|256|
+|dbStorage_readAheadCacheBatchSize|How many entries to pre-fill in cache after a read cache miss|1000|
+|dbStorage_rocksDB_blockCacheSize|Size of RocksDB block-cache. For best performance, this cache should be big enough to hold a significant portion of the index database which can reach ~2GB in some cases|268435456|
+|dbStorage_rocksDB_writeBufferSizeMB||64|
+|dbStorage_rocksDB_sstSizeInMB||64|
+|dbStorage_rocksDB_blockSize||65536|
+|dbStorage_rocksDB_bloomFilterBitsPerKey||10|
+|dbStorage_rocksDB_numLevels||-1|
+|dbStorage_rocksDB_numFilesInLevel0||4|
+|dbStorage_rocksDB_maxSizeInLevel1MB||256|
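+
+These parameters live in `conf/bookkeeper.conf` (the file referenced by the `BOOKIE_CONF` variable above). As a rough sketch only, with illustrative hostnames and paths rather than recommendations, a typical set of overrides might look like:
+
+```properties
+# Port the bookie listens on
+bookiePort=3181
+# Keep the journal and ledgers on separate devices to reduce I/O contention
+journalDirectory=/mnt/journal/bookkeeper/journal
+ledgerDirectories=/mnt/data1/bookkeeper/ledgers,/mnt/data2/bookkeeper/ledgers
+# ZooKeeper quorum used for ledger metadata
+zkServers=zk1:2181,zk2:2181,zk3:2181
+# Turn read-only instead of shutting down when the ledger disks fill up
+readOnlyModeEnabled=true
+diskUsageThreshold=0.95
+```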
+
+
+
+## Broker
+
+Pulsar brokers are responsible for handling incoming messages from producers, dispatching messages to consumers, replicating data between clusters, and more.
+
+|Name|Description|Default|
+|---|---|---|
+|enablePersistentTopics| Whether persistent topics are enabled on the broker |true|
+|enableNonPersistentTopics| Whether non-persistent topics are enabled on the broker |true|
+|functionsWorkerEnabled| Whether the Pulsar Functions worker service is enabled in the broker |false|
+|zookeeperServers| Zookeeper quorum connection string ||
+|globalZookeeperServers| Global Zookeeper quorum connection string ||
+|brokerServicePort| Broker data port |6650|
+|brokerServicePortTls| Broker data port for TLS |6651|
+|webServicePort| Port to use to serve HTTP requests |8080|
+|webServicePortTls| Port to use to serve HTTPS requests |8443|
+|webSocketServiceEnabled| Enable the WebSocket API service in the broker |false|
+|bindAddress| Hostname or IP address the service binds on |0.0.0.0|
+|advertisedAddress| Hostname or IP address the service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. ||
+|clusterName| Name of the cluster to which this broker belongs ||
+|brokerDeduplicationEnabled| Sets the default behavior for message deduplication in the broker. If enabled, the broker will reject messages that were already stored in the topic. This setting can be overridden on a per-namespace basis. |false|
+|brokerDeduplicationMaxNumberOfProducers| The maximum number of producers for which information will be stored for deduplication purposes. |10000|
+|brokerDeduplicationEntriesInterval| The number of entries after which a deduplication informational snapshot is taken. A larger interval will lead to fewer snapshots being taken, though this would also lengthen the topic recovery time (the time required for entries published after the snapshot to be replayed). |1000|
+|brokerDeduplicationProducerInactivityTimeoutMinutes| The time of inactivity (in minutes) after which the broker will discard deduplication information related to a disconnected producer. |360|
+|zooKeeperSessionTimeoutMillis| Zookeeper session timeout in milliseconds |30000|
+|brokerShutdownTimeoutMs| Time to wait for broker graceful shutdown. After this time elapses, the process will be killed. |3000|
+|backlogQuotaCheckEnabled| Enable backlog quota check. Enforces action on topic when the quota is reached |true|
+|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the quota |60|
+|backlogQuotaDefaultLimitGB| Default per-topic backlog quota limit |10|
+|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics |true|
+|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics |60|
+|messageExpiryCheckIntervalInMinutes| How frequently to proactively check and purge expired messages |5|
+|brokerServiceCompactionMonitorIntervalInSeconds| Interval between checks to see if topics with compaction policies need to be compacted |60|
+|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding cursor and dispatching messages when active consumer is changed. |1000|
+|clientLibraryVersionCheckEnabled| Enable check for minimum allowed client library version |false|
+|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true|
+|statusFilePath| Path for the file used to determine the rotation status for the broker when responding to service discovery health checks ||
+|preferLaterVersions| If true (and ModularLoadManagerImpl is being used), the load manager will attempt to use only brokers running the latest software version (to minimize impact to bundles) |false|
+|tlsEnabled| Enable TLS |false|
+|tlsCertificateFilePath| Path for the TLS certificate file ||
+|tlsKeyFilePath| Path for the TLS private key file ||
+|tlsTrustCertsFilePath| Path for the trusted TLS certificate file ||
+|tlsAllowInsecureConnection| Accept untrusted TLS certificate from client |false|
+|maxUnackedMessagesPerConsumer| Max number of unacknowledged messages allowed for a consumer on a shared subscription. The broker stops sending messages to a consumer once this limit is reached, until the consumer starts acknowledging messages. A value of 0 disables the unacked message limit check, and consumers can receive messages without any restriction |50000|
+|maxUnackedMessagesPerSubscription| Max number of unacknowledged messages allowed per shared subscription. The broker stops dispatching messages to all consumers of the subscription once this limit is reached, until consumers start acknowledging messages and the unacked count falls to limit/2. A value of 0 disables the unacked message limit check, and the dispatcher can dispatch messages without any restriction |200000|
+|maxConcurrentLookupRequest| Max number of concurrent lookup requests that the broker allows, used to throttle heavy incoming lookup traffic |10000|
+|maxConcurrentTopicLoadRequest| Max number of concurrent topic loading requests that the broker allows, used to control the number of ZooKeeper operations |5000|
+|authenticationEnabled| Enable authentication |false|
+|authenticationProviders| Authentication provider name list, which is a comma-separated list of class names ||
+|authorizationEnabled| Enforce authorization |false|
+|superUserRoles| Role names that are treated as “super-user”, meaning they will be able to do all admin operations and publish/consume from all topics ||
+|brokerClientAuthenticationPlugin| Authentication settings of the broker itself. Used when the broker connects to other brokers, either in the same or other clusters ||
+|brokerClientAuthenticationParameters|||
+|athenzDomainNames| Supported Athenz provider domain names (comma-separated) for authentication ||
+|bookkeeperClientAuthenticationPlugin| Authentication plugin to use when connecting to bookies ||
+|bookkeeperClientAuthenticationParametersName| BookKeeper auth plugin implementation-specific parameter name and values ||
+|bookkeeperClientAuthenticationParameters|||
+|bookkeeperClientTimeoutInSeconds| Timeout for BK add / read operations |30|
+|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time. A value of 0 disables speculative reads |0|
+|bookkeeperClientHealthCheckEnabled| Enable bookie health checks. Bookies that have more than the configured number of failures within the interval will be quarantined for some time. During this period, new ledgers won’t be created on these bookies |true|
+|bookkeeperClientHealthCheckIntervalSeconds||60|
+|bookkeeperClientHealthCheckErrorThresholdPerInterval||5|
+|bookkeeperClientHealthCheckQuarantineTimeInSeconds||1800|
+|bookkeeperClientRackawarePolicyEnabled| Enable rack-aware bookie selection policy. BK will choose bookies from different racks when forming a new bookie ensemble |true|
+|bookkeeperClientIsolationGroups| Enable bookie isolation by specifying a list of bookie groups to choose from. Any bookie outside the specified groups will not be used by the broker ||
+|managedLedgerDefaultEnsembleSize| Number of bookies to use when creating a ledger |2|
+|managedLedgerDefaultWriteQuorum| Number of copies to store for each message |2|
+|managedLedgerDefaultAckQuorum| Number of guaranteed copies (acks to wait before write is complete) |2|
+|managedLedgerCacheSizeMB| Amount of memory to use for caching data payload in managed ledger. This memory is allocated from JVM direct memory and it’s shared across all the topics running in the same broker |1024|
+|managedLedgerCacheEvictionWatermark| Threshold to which the cache level is brought down when eviction is triggered |0.9|
+|managedLedgerDefaultMarkDeleteRateLimit| Rate limit for the number of writes per second generated by consumers acknowledging messages |1.0|
+|managedLedgerMaxEntriesPerLedger| Max number of entries to append to a ledger before triggering a rollover. A ledger rollover is triggered on these conditions: (1) the max rollover time has been reached, or (2) the max entries have been written to the ledger and at least the min rollover time has passed.|50000|
+|managedLedgerMinLedgerRolloverTimeMinutes| Minimum time between ledger rollover for a topic |10|
+|managedLedgerMaxLedgerRolloverTimeMinutes| Maximum time before forcing a ledger rollover for a topic |240|
+|managedLedgerCursorMaxEntriesPerLedger| Max number of entries to append to a cursor ledger |50000|
+|managedLedgerCursorRolloverTimeInSeconds| Max time before triggering a rollover on a cursor ledger |14400|
+|managedLedgerMaxUnackedRangesToPersist| Max number of “acknowledgment holes” that are going to be persistently stored. When acknowledging out of order, a consumer will leave holes that are supposed to be quickly filled by acking all the messages. The information of which messages are acknowledged is persisted by compressing in “ranges” of messages that were acknowledged. After the max number of ranges is reached, the information will only be tracked in memory and messages will be redelivered in case of crashes. |1000|
+|autoSkipNonRecoverableData| Skip reading non-recoverable/unreadable data ledgers in a managed ledger’s list. This helps when data ledgers get corrupted in BookKeeper and the managed cursor is stuck at that ledger. |false|
+|loadBalancerEnabled| Enable load balancer |true|
+|loadBalancerPlacementStrategy| Strategy to assign a new bundle |weightedRandomSelection|
+|loadBalancerReportUpdateThresholdPercentage| Percentage of change to trigger load report update |10|
+|loadBalancerReportUpdateMaxIntervalMinutes| Maximum interval to update load report |15|
+|loadBalancerHostUsageCheckIntervalMinutes| Frequency at which to collect the host usage report |1|
+|loadBalancerSheddingIntervalMinutes| Load shedding interval. The broker periodically checks whether some traffic should be offloaded from overloaded brokers to underloaded brokers |30|
+|loadBalancerSheddingGracePeriodMinutes| Prevent the same topics from being shed and moved to other brokers more than once within this timeframe |30|
+|loadBalancerBrokerMaxTopics| Usage threshold to allocate max number of topics to broker |50000|
+|loadBalancerBrokerUnderloadedThresholdPercentage| Usage threshold to determine a broker as under-loaded |1|
+|loadBalancerBrokerOverloadedThresholdPercentage| Usage threshold to determine a broker as over-loaded |85|
+|loadBalancerResourceQuotaUpdateIntervalMinutes| Interval to update namespace bundle resource quota |15|
+|loadBalancerBrokerComfortLoadLevelPercentage| Usage threshold to determine a broker has just the right level of load |65|
+|loadBalancerAutoBundleSplitEnabled| Enable/disable namespace bundle auto-split |false|
+|loadBalancerNamespaceBundleMaxTopics| Maximum topics in a bundle, otherwise bundle split will be triggered |1000|
+|loadBalancerNamespaceBundleMaxSessions| Maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered |1000|
+|loadBalancerNamespaceBundleMaxMsgRate| Maximum msgRate (in + out) in a bundle, otherwise bundle split will be triggered |1000|
+|loadBalancerNamespaceBundleMaxBandwidthMbytes| Maximum bandwidth (in + out) in a bundle, otherwise bundle split will be triggered |100|
+|loadBalancerNamespaceMaximumBundles| Maximum number of bundles in a namespace |128|
+|replicationMetricsEnabled| Enable replication metrics |true|
+|replicationConnectionsPerBroker| Max number of connections to open for each broker in a remote cluster. More connections host-to-host lead to better throughput over high-latency links. |16|
+|replicationProducerQueueSize| Replicator producer queue size |1000|
+|replicatorPrefix| Replicator prefix used for replicator producer name and cursor name |pulsar.repl|
+|replicationTlsEnabled| Enable TLS when talking with other clusters to replicate messages |false|
+|defaultRetentionTimeInMinutes| Default message retention time ||
+|defaultRetentionSizeInMB| Default retention size |0|
+|keepAliveIntervalSeconds| How often to check whether the connections are still alive |30|
+|brokerServicePurgeInactiveFrequencyInSeconds| How often the broker checks for inactive topics to be deleted (topics with no subscriptions and no one connected) |60|
+|loadManagerClassName| Name of load manager to use |org.apache.pulsar.broker.loadbalance.impl.SimpleLoadManagerImpl|
+|managedLedgerOffloadDriver| Driver to use to offload old data to long term storage (Possible values: S3) ||
+|managedLedgerOffloadMaxThreads| Maximum number of thread pool threads for ledger offloading |2|
+|s3ManagedLedgerOffloadRegion| For Amazon S3 ledger offload, AWS region ||
+|s3ManagedLedgerOffloadBucket| For Amazon S3 ledger offload, bucket to place offloaded ledgers into ||
+|s3ManagedLedgerOffloadServiceEndpoint| For Amazon S3 ledger offload, alternative endpoint to connect to (useful for testing) ||
+|s3ManagedLedgerOffloadMaxBlockSizeInBytes| For Amazon S3 ledger offload, max block size in bytes. (64MB by default, 5MB minimum) |67108864|
+|s3ManagedLedgerOffloadReadBufferSizeInBytes| For Amazon S3 ledger offload, read buffer size in bytes (1MB by default) |1048576|
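+
+Brokers read these parameters from `conf/broker.conf`. The following sketch is illustrative only; the cluster name, ZooKeeper hostnames, and quorum sizes are placeholders, not recommendations:
+
+```properties
+# Cluster membership and metadata
+clusterName=my-cluster
+zookeeperServers=zk1:2181,zk2:2181,zk3:2181
+# Listener ports
+brokerServicePort=6650
+webServicePort=8080
+# Store two copies of each message and wait for both acks
+managedLedgerDefaultEnsembleSize=2
+managedLedgerDefaultWriteQuorum=2
+managedLedgerDefaultAckQuorum=2
+```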
+
+
+
+## Client
+
+The [`pulsar-client`](reference-cli-tools.md#pulsar-client) CLI tool can be used to publish messages to Pulsar and consume messages from Pulsar topics. This tool can be used in lieu of a client library.
+
+|Name|Description|Default|
+|---|---|---|
+|webServiceUrl| The web URL for the cluster. |http://localhost:8080/|
+|brokerServiceUrl| The Pulsar protocol URL for the cluster. |pulsar://localhost:6650/|
+|authPlugin| The authentication plugin. ||
+|authParams| The authentication parameters for the cluster, as a comma-separated string. ||
+|useTls| Whether or not TLS authentication will be enforced in the cluster. |false|
+|tlsAllowInsecureConnection|||
+|tlsTrustCertsFilePath|||
+
+
+## Service discovery
+
+|Name|Description|Default|
+|---|---|---|
+|zookeeperServers| Zookeeper quorum connection string (comma-separated) ||
+|globalZookeeperServers| Global zookeeper quorum connection string (comma-separated) ||
+|zookeeperSessionTimeoutMs| ZooKeeper session timeout |30000|
+|servicePort| Port to use to serve binary-proto requests |6650|
+|servicePortTls| Port to use to serve binary-proto-tls requests |6651|
+|webServicePort| Port that the discovery service listens on |8080|
+|webServicePortTls| Port to use to serve HTTPS requests |8443|
+|bindOnLocalhost| Control whether to bind directly on localhost rather than on the normal hostname |false|
+|authenticationEnabled| Enable authentication |false|
+|authenticationProviders| Authentication provider name list, which is a comma-separated list of class names ||
+|authorizationEnabled| Enforce authorization |false|
+|superUserRoles| Role names that are treated as “super-user”, meaning they will be able to do all admin operations and publish/consume from all topics (comma-separated) ||
+|tlsEnabled| Enable TLS |false|
+|tlsCertificateFilePath| Path for the TLS certificate file ||
+|tlsKeyFilePath| Path for the TLS private key file ||
+
+
+
+## Log4j
+
+
+|Name|Default|
+|---|---|
+|pulsar.root.logger| WARN,CONSOLE|
+|pulsar.log.dir| logs|
+|pulsar.log.file| pulsar.log|
+|log4j.rootLogger| ${pulsar.root.logger}|
+|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender|
+|log4j.appender.CONSOLE.Threshold| DEBUG|
+|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout|
+|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ISO8601} - %-5p - [%t:%C{1}@%L] - %m%n|
+|log4j.appender.ROLLINGFILE| org.apache.log4j.DailyRollingFileAppender|
+|log4j.appender.ROLLINGFILE.Threshold| DEBUG|
+|log4j.appender.ROLLINGFILE.File| ${pulsar.log.dir}/${pulsar.log.file}|
+|log4j.appender.ROLLINGFILE.layout| org.apache.log4j.PatternLayout|
+|log4j.appender.ROLLINGFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L] - %m%n|
+|log4j.appender.TRACEFILE| org.apache.log4j.FileAppender|
+|log4j.appender.TRACEFILE.Threshold| TRACE|
+|log4j.appender.TRACEFILE.File| pulsar-trace.log|
+|log4j.appender.TRACEFILE.layout| org.apache.log4j.PatternLayout|
+|log4j.appender.TRACEFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L][%x] - %m%n|
+
+
+## Log4j shell
+
+|Name|Default|
+|---|---|
+|bookkeeper.root.logger| ERROR,CONSOLE|
+|log4j.rootLogger| ${bookkeeper.root.logger}|
+|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender|
+|log4j.appender.CONSOLE.Threshold| DEBUG|
+|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout|
+|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ABSOLUTE} %-5p %m%n|
+|log4j.logger.org.apache.zookeeper| ERROR|
+|log4j.logger.org.apache.bookkeeper| ERROR|
+|log4j.logger.org.apache.bookkeeper.bookie.BookieShell| INFO|
+
+
+## Standalone
+
+|Name|Description|Default|
+|---|---|---|
+|zookeeperServers| The quorum connection string for local ZooKeeper ||
+|globalZookeeperServers| The quorum connection string for global ZooKeeper ||
+|brokerServicePort| The port on which the standalone broker listens for connections |6650|
+|webServicePort| The port used by the standalone broker for HTTP requests |8080|
+|bindAddress| The hostname or IP address on which the standalone service binds |0.0.0.0|
+|advertisedAddress| The hostname or IP address that the standalone service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. ||
+|clusterName| The name of the cluster that this broker belongs to. |standalone|
+|zooKeeperSessionTimeoutMillis| The ZooKeeper session timeout, in milliseconds. |30000|
+|brokerShutdownTimeoutMs| The time to wait for graceful broker shutdown. After this time elapses, the process will be killed. |3000|
+|backlogQuotaCheckEnabled| Enable the backlog quota check, which enforces a specified action when the quota is reached. |true|
+|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the backlog quota. |60|
+|backlogQuotaDefaultLimitGB| The default per-topic backlog quota limit. |10|
+|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics. |true|
+|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics, in seconds. |60|
+|messageExpiryCheckIntervalInMinutes| How often to proactively check and purge expired messages. |5|
+|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding cursor and dispatching messages when active consumer is changed. |1000|
+|clientLibraryVersionCheckEnabled| Enable checks for minimum allowed client library version. |false|
+|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true|
+|statusFilePath| The path for the file used to determine the rotation status for the broker when responding to service discovery health checks |/usr/local/apache/htdocs|
+|maxUnackedMessagesPerConsumer| The maximum number of unacknowledged messages allowed to be received by consumers on a shared subscription. The broker will stop sending messages to a consumer once this limit is reached or until the consumer begins acknowledging messages. A value of 0 disables the unacked message limit check and thus allows consumers to receive messages without any restrictions. |50000|
+|maxUnackedMessagesPerSubscription| The same as above, except per subscription rather than per consumer. |200000|
+|authenticationEnabled| Enable authentication for the broker. |false|
+|authenticationProviders| A comma-separated list of class names for authentication providers. ||
+|authorizationEnabled| Enforce authorization in brokers. |false|
+|superUserRoles| Role names that are treated as “superusers.” Superusers are authorized to perform all admin tasks. ||
+|brokerClientAuthenticationPlugin| The authentication settings of the broker itself. Used when the broker connects to other brokers either in the same cluster or from other clusters. ||
+|brokerClientAuthenticationParameters| The parameters that go along with the plugin specified using brokerClientAuthenticationPlugin. ||
+|athenzDomainNames| Supported Athenz authentication provider domain names as a comma-separated list. ||
+|bookkeeperClientAuthenticationPlugin| Authentication plugin to be used when connecting to bookies (BookKeeper servers). ||
+|bookkeeperClientAuthenticationParametersName| BookKeeper authentication plugin implementation parameters and values. ||
+|bookkeeperClientAuthenticationParameters| Parameters associated with the bookkeeperClientAuthenticationParametersName ||
+|bookkeeperClientTimeoutInSeconds| Timeout for BookKeeper add and read operations. |30|
+|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time. A value of 0 disables speculative reads. |0|
+|bookkeeperClientHealthCheckEnabled| Enable bookie health checks. |true|
+|bookkeeperClientHealthCheckIntervalSeconds| The time interval, in seconds, at which health checks are performed. New ledgers are not created during health checks. |60|
+|bookkeeperClientHealthCheckErrorThresholdPerInterval| Error threshold for health checks. |5|
+|bookkeeperClientHealthCheckQuarantineTimeInSeconds| If bookies have more than the allowed number of failures within the time interval specified by bookkeeperClientHealthCheckIntervalSeconds, they are quarantined for this amount of time, in seconds. |1800|
+|bookkeeperClientRackawarePolicyEnabled| |true|
+|bookkeeperClientIsolationGroups|||
+|managedLedgerDefaultEnsembleSize| |1|
+|managedLedgerDefaultWriteQuorum| |1|
+|managedLedgerDefaultAckQuorum| |1|
+|managedLedgerCacheSizeMB| |1024|
+|managedLedgerCacheEvictionWatermark| |0.9|
+|managedLedgerDefaultMarkDeleteRateLimit| |0.1|
+|managedLedgerMaxEntriesPerLedger| |50000|
+|managedLedgerMinLedgerRolloverTimeMinutes| |10|
+|managedLedgerMaxLedgerRolloverTimeMinutes| |240|
+|managedLedgerCursorMaxEntriesPerLedger| |50000|
+|managedLedgerCursorRolloverTimeInSeconds| |14400|
+|autoSkipNonRecoverableData| |false|
+|loadBalancerEnabled| |false|
+|loadBalancerPlacementStrategy| |weightedRandomSelection|
+|loadBalancerReportUpdateThresholdPercentage| |10|
+|loadBalancerReportUpdateMaxIntervalMinutes| |15|
+|loadBalancerHostUsageCheckIntervalMinutes| |1|
+|loadBalancerSheddingIntervalMinutes| |30|
+|loadBalancerSheddingGracePeriodMinutes| |30|
+|loadBalancerBrokerMaxTopics| |50000|
+|loadBalancerBrokerUnderloadedThresholdPercentage| |1|
+|loadBalancerBrokerOverloadedThresholdPercentage| |85|
+|loadBalancerResourceQuotaUpdateIntervalMinutes| |15|
+|loadBalancerBrokerComfortLoadLevelPercentage| |65|
+|loadBalancerAutoBundleSplitEnabled| |false|
+|loadBalancerNamespaceBundleMaxTopics| |1000|
+|loadBalancerNamespaceBundleMaxSessions| |1000|
+|loadBalancerNamespaceBundleMaxMsgRate| |1000|
+|loadBalancerNamespaceBundleMaxBandwidthMbytes| |100|
+|loadBalancerNamespaceMaximumBundles| |128|
+|replicationMetricsEnabled| |true|
+|replicationConnectionsPerBroker| |16|
+|replicationProducerQueueSize| |1000|
+|defaultRetentionTimeInMinutes| |0|
+|defaultRetentionSizeInMB| |0|
+|keepAliveIntervalSeconds| |30|
+|brokerServicePurgeInactiveFrequencyInSeconds| |60|
+
+
+
+
+
+## WebSocket
+
+|Name|Description|Default|
+|---|---|---|
+|globalZookeeperServers|||
+|zooKeeperSessionTimeoutMillis| |30000|
+|serviceUrl|||
+|serviceUrlTls|||
+|brokerServiceUrl|||
+|brokerServiceUrlTls|||
+|webServicePort||8080|
+|webServicePortTls||8443|
+|bindAddress||0.0.0.0|
+|clusterName|||
+|authenticationEnabled||false|
+|authenticationProviders|||
+|authorizationEnabled||false|
+|superUserRoles|||
+|brokerClientAuthenticationPlugin|||
+|brokerClientAuthenticationParameters|||
+|tlsEnabled||false|
+|tlsAllowInsecureConnection||false|
+|tlsCertificateFilePath|||
+|tlsKeyFilePath|||
+|tlsTrustCertsFilePath|||
+
+
+## Pulsar proxy
+
+The [Pulsar proxy](getting-started-concepts-and-architecture.md#pulsar-proxy) can be configured in the `conf/proxy.conf` file.
+
+
+|Name|Description|Default|
+|---|---|---|
+|zookeeperServers| The ZooKeeper quorum connection string (as a comma-separated list) ||
+|configurationStoreServers| Configuration store connection string (as a comma-separated list) ||
+|zookeeperSessionTimeoutMs| ZooKeeper session timeout (in milliseconds) |30000|
+|servicePort| The port to use to serve binary protobuf requests |6650|
+|servicePortTls| The port to use to serve binary protobuf TLS requests |6651|
+|statusFilePath| Path for the file used to determine the rotation status for the proxy instance when responding to service discovery health checks ||
+|authenticationEnabled| Whether authentication is enabled for the Pulsar proxy |false|
+|authenticationProviders| Authentication provider name list (a comma-separated list of class names) ||
+|authorizationEnabled| Whether authorization is enforced by the Pulsar proxy |false|
+|authorizationProvider| Authorization provider as a fully qualified class name |org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider|
+|brokerClientAuthenticationPlugin| The authentication plugin used by the Pulsar proxy to authenticate with Pulsar brokers ||
+|brokerClientAuthenticationParameters| The authentication parameters used by the Pulsar proxy to authenticate with Pulsar brokers ||
+|brokerClientTrustCertsFilePath| The path to trusted certificates used by the Pulsar proxy to authenticate with Pulsar brokers ||
+|superUserRoles| Role names that are treated as “super-users,” meaning that they will be able to perform all admin operations ||
+|forwardAuthorizationCredentials| Whether client authorization credentials are forwarded to the broker for re-authorization. Authentication must be enabled via authenticationEnabled=true for this to take effect. |false|
+|maxConcurrentInboundConnections| Max concurrent inbound connections. The proxy will reject requests beyond that. |10000|
+|maxConcurrentLookupRequests| Max concurrent outbound connections. The proxy will error out requests beyond that. |10000|
+|tlsEnabledInProxy| Whether TLS is enabled for the proxy |false|
+|tlsEnabledWithBroker| Whether TLS is enabled when communicating with Pulsar brokers |false|
+|tlsCertificateFilePath| Path for the TLS certificate file ||
+|tlsKeyFilePath| Path for the TLS private key file ||
+|tlsTrustCertsFilePath| Path for the trusted TLS certificate pem file ||
+|tlsHostnameVerificationEnabled| Whether the hostname is validated when the proxy creates a TLS connection with brokers |false|
+|tlsRequireTrustedClientCertOnConnect| Whether client certificates are required for TLS. Connections are rejected if the client certificate isn’t trusted. |false|
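+
+As an illustrative sketch only (the ZooKeeper hostnames are placeholders), a minimal `conf/proxy.conf` might set just the connection and port parameters:
+
+```properties
+# Metadata stores used to discover brokers
+zookeeperServers=zk1:2181,zk2:2181,zk3:2181
+configurationStoreServers=zk1:2181,zk2:2181,zk3:2181
+# Ports the proxy listens on
+servicePort=6650
+servicePortTls=6651
+```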
+
+
+## ZooKeeper
+
+ZooKeeper handles a broad range of essential configuration- and coordination-related tasks for Pulsar. The default configuration file for ZooKeeper is in the `conf/zookeeper.conf` file in your Pulsar installation. The following parameters are available:
+
+
+|Name|Description|Default|
+|---|---|---|
+|tickTime| The tick is the basic unit of time in ZooKeeper, measured in milliseconds and used to regulate things like heartbeats and timeouts. tickTime is the length of a single tick. |2000|
+|initLimit| The maximum time, in ticks, that the leader ZooKeeper server allows follower ZooKeeper servers to successfully connect and sync. The tick time is set in milliseconds using the tickTime parameter. |10|
+|syncLimit| The maximum time, in ticks, that a follower ZooKeeper server is allowed to sync with other ZooKeeper servers. The tick time is set in milliseconds using the tickTime parameter. |5|
+|dataDir| The location where ZooKeeper will store in-memory database snapshots as well as the transaction log of updates to the database. |data/zookeeper|
+|clientPort| The port on which the ZooKeeper server will listen for connections. |2181|
+|autopurge.snapRetainCount| In ZooKeeper, auto purge determines how many recent snapshots of the database stored in dataDir to retain within the time interval specified by autopurge.purgeInterval (while deleting the rest). |3|
+|autopurge.purgeInterval| The time interval, in hours, by which the ZooKeeper database purge task is triggered. Setting to a non-zero number will enable auto purge; setting to 0 will disable. Read the [ZooKeeper Administrator's Guide](https://zookeeper.apache.org/doc/current/zookeeperAdmin.html) before enabling auto purge. |1|
+|maxClientCnxns| The maximum number of client connections. Increase this if you need to handle more ZooKeeper clients. |60|
+
+
+
+
+In addition to the parameters in the table above, configuring ZooKeeper for Pulsar involves adding
+a `server.N` line to the `conf/zookeeper.conf` file for each node in the ZooKeeper cluster, where `N` is the number of the ZooKeeper node. Here's an example for a three-node ZooKeeper cluster:
+
+```properties
+server.1=zk1.us-west.example.com:2888:3888
+server.2=zk2.us-west.example.com:2888:3888
+server.3=zk3.us-west.example.com:2888:3888
+```
+
+> We strongly recommend consulting the [ZooKeeper Administrator's Guide](https://zookeeper.apache.org/doc/current/zookeeperAdmin.html) for a more thorough and comprehensive introduction to ZooKeeper configuration
diff --git a/site2/docs/reference-pulsar-admin.md b/site2/docs/reference-pulsar-admin.md
new file mode 100644
index 0000000000000000000000000000000000000000..565411ae6f85550390c63c35f83efb7dba3793e5
--- /dev/null
+++ b/site2/docs/reference-pulsar-admin.md
@@ -0,0 +1,1625 @@
+---
+id: pulsar-admin
+title: Pulsar admin CLI
+sidebar_label: Pulsar Admin CLI
+---
+
+The `pulsar-admin` tool enables you to manage Pulsar installations, including clusters, brokers, namespaces, tenants, and more.
+
+Usage
+```bash
+$ pulsar-admin command
+```
+
+Commands
+* `broker-stats`
+* `brokers`
+* `clusters`
+* `functions`
+* `namespaces`
+* `ns-isolation-policy`
+* `sink`
+* `source`
+* `topics`
+* `tenants`
+* `resource-quotas`
+* `schemas`
+
+## `broker-stats`
+
+Operations to collect broker statistics
+
+```bash
+$ pulsar-admin broker-stats subcommand
+```
+
+Subcommands
+* `allocator-stats`
+* `destinations`
+* `mbeans`
+* `monitoring-metrics`
+* `topics`
+
+
+### `allocator-stats`
+
+Dump allocator stats
+
+Usage
+```bash
+$ pulsar-admin broker-stats allocator-stats allocator-name
+```
+
+### `destinations`
+
+Dump topic stats
+
+Usage
+```bash
+$ pulsar-admin broker-stats destinations options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`-i`, `--indent`|Indent JSON output|false|
+
+### `mbeans`
+
+Dump Mbean stats
+
+Usage
+```bash
+$ pulsar-admin broker-stats mbeans options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`-i`, `--indent`|Indent JSON output|false|
+
+
+### `monitoring-metrics`
+
+Dump metrics for monitoring
+
+Usage
+```bash
+$ pulsar-admin broker-stats monitoring-metrics options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`-i`, `--indent`|Indent JSON output|false|
+
+
+### `topics`
+
+Dump topic stats
+
+Usage
+```bash
+$ pulsar-admin broker-stats topics options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`-i`, `--indent`|Indent JSON output|false|
+
+
+## `brokers`
+
+Operations about brokers
+
+```bash
+$ pulsar-admin brokers subcommand
+```
+
+Subcommands
+* `list`
+* `namespaces`
+* `update-dynamic-config`
+* `list-dynamic-config`
+* `get-all-dynamic-config`
+* `get-internal-config`
+
+### `list`
+List active brokers of the cluster
+
+Usage
+```bash
+$ pulsar-admin brokers list cluster-name
+```
+
+### `namespaces`
+List namespaces owned by the broker
+
+Usage
+```bash
+$ pulsar-admin brokers namespaces cluster-name options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--url`|The URL for the broker||
+
+
+### `update-dynamic-config`
+Update a broker's dynamic service configuration
+
+Usage
+```bash
+$ pulsar-admin brokers update-dynamic-config options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--config`|Service configuration parameter name||
+|`--value`|Value for the configuration parameter specified using the `--config` flag||
+
+
+### `list-dynamic-config`
+Get the list of updatable configuration names
+
+Usage
+```bash
+$ pulsar-admin brokers list-dynamic-config
+```
+
+### `get-all-dynamic-config`
+Get all overridden dynamic-configuration values
+
+Usage
+```bash
+$ pulsar-admin brokers get-all-dynamic-config
+```
+
+### `get-internal-config`
+Get internal configuration information
+
+Usage
+```bash
+$ pulsar-admin brokers get-internal-config
+```
+
+
+## `clusters`
+Operations about clusters
+
+Usage
+```bash
+$ pulsar-admin clusters subcommand
+```
+
+Subcommands
+* `get`
+* `create`
+* `update`
+* `delete`
+* `list`
+* `update-peer-clusters`
+
+
+### `get`
+Get the configuration data for the specified cluster
+
+Usage
+```bash
+$ pulsar-admin clusters get cluster-name
+```
+
+### `create`
+Provisions a new cluster. This operation requires Pulsar super-user privileges.
+
+Usage
+```bash
+$ pulsar-admin clusters create cluster-name options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--broker-url`|The URL for the broker service.||
+|`--broker-url-secure`|The broker service URL for a secure connection||
+|`--url`|The service URL||
+|`--url-secure`|The service URL for a secure connection||
+
+
+### `update`
+Update the configuration for a cluster
+
+Usage
+```bash
+$ pulsar-admin clusters update cluster-name options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--broker-url`|The URL for the broker service.||
+|`--broker-url-secure`|The broker service URL for a secure connection||
+|`--url`|The service URL||
+|`--url-secure`|The service URL for a secure connection||
+
+
+### `delete`
+Deletes an existing cluster
+
+Usage
+```bash
+$ pulsar-admin clusters delete cluster-name
+```
+
+### `list`
+List the existing clusters
+
+Usage
+```bash
+$ pulsar-admin clusters list
+```
+
+### `update-peer-clusters`
+Update peer cluster names
+
+Usage
+```bash
+$ pulsar-admin clusters update-peer-clusters peer-cluster-names
+```
+
+## `functions`
+
+A command-line interface for Pulsar Functions
+
+Usage
+```bash
+$ pulsar-admin functions subcommand
+```
+
+Subcommands
+* `localrun`
+* `create`
+* `delete`
+* `update`
+* `get`
+* `getstatus`
+* `list`
+* `querystate`
+* `trigger`
+
+
+### `localrun`
+Run a Pulsar Function locally
+
+
+Usage
+```bash
+$ pulsar-admin functions localrun options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--cpu`|The CPU to allocate to each function instance (in number of cores)||
+|`--ram`|The RAM to allocate to each function instance (in bytes)||
+|`--disk`|The disk space to allocate to each function instance (in bytes)||
+|`--brokerServiceUrl`|The URL of the Pulsar broker||
+|`--className`|The name of the function’s class||
+|`--customSerdeInputs`|A map of the input topic to SerDe name||
+|`--functionConfigFile`|The path of the YAML config file used to configure the function||
+|`--inputs`|The input topics for the function (as a comma-separated list if more than one topic is desired)||
+|`--logTopic`|The topic to which logs from this function are published||
+|`--jar`|A path to the JAR file for the function (if the function is written in Java)||
+|`--name`|The name of the function|The value specified by `--className`|
+|`--namespace`|The function’s namespace||
+|`--output`|The name of the topic to which the function publishes its output (if any)||
+|`--outputSerdeClassName`|The SerDe class used for the function’s output||
+|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1|
+|`--processingGuarantees`|The processing guarantees applied to the function. Can be one of: ATLEAST_ONCE, ATMOST_ONCE, or EFFECTIVELY_ONCE|ATLEAST_ONCE|
+|`--py`|The path of the Python file containing the function’s processing logic (if the function is written in Python)||
+|`--stateStorageServiceUrl`|The service URL for the function’s state storage (if the function uses a storage system different from the Apache BookKeeper cluster used by Pulsar)||
+|`--subscriptionType`|The subscription type used by the function when consuming messages on the input topic(s). Can be either SHARED or EXCLUSIVE|SHARED|
+|`--tenant`|The function’s tenant||
+|`--userConfig`|A user-supplied config value, set as a key/value pair. You can set multiple user config values.||
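+
+As an illustration only (the jar, class, tenant, and topic names below are invented), running a Java function locally might look like:
+
+```bash
+$ pulsar-admin functions localrun \
+--jar my-functions.jar \
+--className org.example.WordCountFunction \
+--tenant public \
+--namespace default \
+--name word-count \
+--inputs persistent://public/default/sentences \
+--output persistent://public/default/counts
+```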
+
+
+### `create`
+Creates a new Pulsar Function on the target infrastructure
+
+Usage
+```bash
+$ pulsar-admin functions create options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--cpu`|The CPU to allocate to each function instance (in number of cores)||
+|`--ram`|The RAM to allocate to each function instance (in bytes)||
+|`--disk`|The disk space to allocate to each function instance (in bytes)||
+|`--brokerServiceUrl`|The URL of the Pulsar broker||
+|`--className`|The name of the function’s class||
+|`--customSerdeInputs`|A map of the input topic to SerDe name||
+|`--functionConfigFile`|The path of the YAML config file used to configure the function||
+|`--inputs`|The input topics for the function (as a comma-separated list if more than one topic is desired)||
+|`--logTopic`|The topic to which logs from this function are published||
+|`--jar`|A path to the JAR file for the function (if the function is written in Java)||
+|`--name`|The name of the function|The value specified by `--className`|
+|`--namespace`|The function’s namespace||
+|`--output`|The name of the topic to which the function publishes its output (if any)||
+|`--outputSerdeClassName`|The SerDe class used for the function’s output||
+|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1|
+|`--processingGuarantees`|The processing guarantees applied to the function. Can be one of: ATLEAST_ONCE, ATMOST_ONCE, or EFFECTIVELY_ONCE|ATLEAST_ONCE|
+|`--py`|The path of the Python file containing the function’s processing logic (if the function is written in Python)||
+|`--stateStorageServiceUrl`|The service URL for the function’s state storage (if the function uses a storage system different from the Apache BookKeeper cluster used by Pulsar)||
+|`--subscriptionType`|The subscription type used by the function when consuming messages on the input topic(s). Can be either SHARED or EXCLUSIVE|SHARED|
+|`--tenant`|The function’s tenant||
+|`--userConfig`|A user-supplied config value, set as a key/value pair. You can set multiple user config values.||
+
+
+### `delete`
+Deletes an existing Pulsar Function
+
+Usage
+```bash
+$ pulsar-admin functions delete options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--name`|The name of the function to delete||
+|`--namespace`|The namespace of the function to delete||
+|`--tenant`|The tenant of the function to delete||
+
+
+### `update`
+Updates an existing Pulsar Function
+
+Usage
+```bash
+$ pulsar-admin functions update options
+```
+
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--className`|The name of the function’s class||
+|`--customSerdeInputs`|A map of the input topic to SerDe name||
+|`--functionConfigFile`|The path of the YAML config file used to configure the function||
+|`--inputs`|The input topics for the function (as a comma-separated list if more than one topic is desired)||
+|`--logTopic`|The topic to which logs from this function are published||
+|`--jar`|A path to the JAR file for the function (if the function is written in Java)||
+|`--name`|The name of the function|The value specified by `--className`|
+|`--namespace`|The function’s namespace||
+|`--output`|The name of the topic to which the function publishes its output (if any)||
+|`--outputSerdeClassName`|The SerDe class used for the function’s output||
+|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1|
+|`--processingGuarantees`|The processing guarantees applied to the function. Can be one of: ATLEAST_ONCE, ATMOST_ONCE, or EFFECTIVELY_ONCE|ATLEAST_ONCE|
+|`--py`|The path of the Python file containing the function’s processing logic (if the function is written in Python)||
+|`--subscriptionType`|The subscription type used by the function when consuming messages on the input topic(s). Can be either SHARED or EXCLUSIVE|SHARED|
+|`--tenant`|The function’s tenant||
+|`--userConfig`|A user-supplied config value, set as a key/value pair. You can set multiple user config values.||
+
+### `get`
+Fetch information about an existing Pulsar Function
+
+Usage
+```bash
+$ pulsar-admin functions get options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--name`|The name of the function||
+|`--namespace`|The namespace of the function||
+|`--tenant`|The tenant of the function||
+
+
+### `getstatus`
+Get the status of an existing Pulsar Function
+
+Usage
+```bash
+$ pulsar-admin functions getstatus options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--name`|The name of the function||
+|`--namespace`|The namespace of the function||
+|`--tenant`|The tenant of the function||
+
+
+### `list`
+List all Pulsar Functions for a specific tenant and namespace
+
+Usage
+```bash
+$ pulsar-admin functions list options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--namespace`|The namespace of the functions||
+|`--tenant`|The tenant of the functions||
+
+
+### `querystate`
+Retrieve the current state of a Pulsar Function by key
+
+Usage
+```bash
+$ pulsar-admin functions querystate options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`-k`, `--key`|The key for the state you want to fetch||
+|`--name`|The name of the function whose state you want to query||
+|`--namespace`|The namespace of the function whose state you want to query||
+|`--tenant`|The tenant of the function whose state you want to query||
+|`-u`, `--storage-service-url`|The service URL for the function’s state storage (if the function uses a storage system different from the Apache BookKeeper cluster used by Pulsar)||
+|`-w`, `--watch`|If set, watching for state changes is enabled|false|
+
+
+### `trigger`
+Triggers the specified Pulsar Function with a supplied value or file data
+
+Usage
+```bash
+$ pulsar-admin functions trigger options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--name`|The name of the Pulsar Function to trigger||
+|`--namespace`|The namespace of the Pulsar Function to trigger||
+|`--tenant`|The tenant of the Pulsar Function to trigger||
+|`--triggerFile`|The path to the file containing the data with which the Pulsar Function is to be triggered||
+|`--triggerValue`|The value with which the Pulsar Function is to be triggered||
+
+
+## `namespaces`
+
+Operations for managing namespaces
+
+
+```bash
+$ pulsar-admin namespaces subcommand
+```
+
+Subcommands
+* `list`
+* `list-cluster`
+* `destinations`
+* `policies`
+* `create`
+* `delete`
+* `set-deduplication`
+* `permissions`
+* `grant-permission`
+* `revoke-permission`
+* `set-clusters`
+* `get-clusters`
+* `get-backlog-quotas`
+* `set-backlog-quota`
+* `remove-backlog-quota`
+* `get-persistence`
+* `set-persistence`
+* `get-message-ttl`
+* `set-message-ttl`
+* `get-retention`
+* `set-retention`
+* `unload`
+* `clear-backlog`
+* `unsubscribe`
+* `get-compaction-threshold`
+* `set-compaction-threshold`
+* `get-offload-threshold`
+* `set-offload-threshold`
+
+
+### `list`
+Get the namespaces for a tenant
+
+Usage
+```bash
+$ pulsar-admin namespaces list tenant-name
+```
+
+### `list-cluster`
+Get the namespaces for a tenant in the cluster
+
+Usage
+```bash
+$ pulsar-admin namespaces list-cluster tenant/cluster
+```
+
+### `destinations`
+Get the destinations for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces destinations tenant/cluster/namespace
+```
+
+### `policies`
+Get the policies of a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces policies tenant/cluster/namespace
+```
+
+### `create`
+Create a new namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces create tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`-b`, `--bundles`|The number of bundles to activate|0|
+
+
+### `delete`
+Deletes a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces delete tenant/cluster/namespace
+```
+
+### `set-deduplication`
+Enable or disable message deduplication on a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces set-deduplication tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--enable`, `-e`|Enable message deduplication on the specified namespace|false|
+|`--disable`, `-d`|Disable message deduplication on the specified namespace|false|
+
+
+### `permissions`
+Get the permissions on a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces permissions tenant/cluster/namespace
+```
+
+### `grant-permission`
+Grant permissions on a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces grant-permission tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--actions`|Actions to be granted (`produce` or `consume`)||
+|`--role`|The client role to which to grant the permissions||
+
+
+### `revoke-permission`
+Revoke permissions on a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces revoke-permission tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`--role`|The client role from which to revoke the permissions||
+
+
+### `set-clusters`
+Set replication clusters for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces set-clusters tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|---|---|---|
+|`-c`, `--clusters`|Replication clusters ID list (comma-separated values)||
+
+
+### `get-clusters`
+Get replication clusters for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces get-clusters tenant/cluster/namespace
+```
+
+### `get-backlog-quotas`
+Get the backlog quota policies for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces get-backlog-quotas tenant/cluster/namespace
+```
+
+### `set-backlog-quota`
+Set a backlog quota for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces set-backlog-quota tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|----|---|---|
+|`-l`, `--limit`|The backlog size limit (for example `10M` or `16G`)||
+|`-p`, `--policy`|The retention policy to enforce when the limit is reached. The valid options are: `producer_request_hold`, `producer_exception` or `consumer_backlog_eviction`||
+
+Example
+```bash
+$ pulsar-admin namespaces set-backlog-quota my-prop/my-cluster/my-ns \
+--limit 2G \
+--policy producer_request_hold
+```
+
+### `remove-backlog-quota`
+Remove a backlog quota policy from a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces remove-backlog-quota tenant/cluster/namespace
+```
+
+### `get-persistence`
+Get the persistence policies for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces get-persistence tenant/cluster/namespace
+```
+
+### `set-persistence`
+Set the persistence policies for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces set-persistence tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|----|---|---|
+|`-a`, `--bookkeeper-ack-quorom`|The number of acks (guaranteed copies) to wait for each entry|0|
+|`-e`, `--bookkeeper-ensemble`|The number of bookies to use for a topic|0|
+|`-w`, `--bookkeeper-write-quorum`|How many writes to make of each entry|0|
+|`-r`, `--ml-mark-delete-max-rate`|Throttling rate of mark-delete operation (0 means no throttle)||
+
+
+### `get-message-ttl`
+Get the message TTL for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces get-message-ttl tenant/cluster/namespace
+```
+
+### `set-message-ttl`
+Set the message TTL for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces set-message-ttl tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|----|---|---|
+|`-ttl`, `--messageTTL`|Message TTL in seconds|0|
+
+
+### `get-retention`
+Get the retention policy for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces get-retention tenant/cluster/namespace
+```
+
+### `set-retention`
+Set the retention policy for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces set-retention tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|----|---|---|
+|`-s`, `--size`|The retention size limits (for example 10M, 16G or 3T). 0 means no retention and -1 means infinite size retention||
+|`-t`, `--time`|The retention time in minutes, hours, days, or weeks. Examples: 100m, 13h, 2d, 5w. 0 means no retention and -1 means infinite time retention||
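+
+Example (the namespace, size, and time values are illustrative):
+```bash
+$ pulsar-admin namespaces set-retention my-tenant/my-cluster/my-ns \
+--size 10G \
+--time 2d
+```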
+
+
+### `unload`
+Unload a namespace or namespace bundle from the current serving broker.
+
+Usage
+```bash
+$ pulsar-admin namespaces unload tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|----|---|---|
+|`-b`, `--bundle`|||
+
+
+### `clear-backlog`
+Clear the backlog for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces clear-backlog tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|----|---|---|
+|`-b`, `--bundle`|||
+|`-f`, `--force`|Whether to force a backlog clear without prompting|false|
+|`-s`, `--sub`|The subscription name||
+
+
+### `unsubscribe`
+Unsubscribe the given subscription on all destinations on a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces unsubscribe tenant/cluster/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|----|---|---|
+|`-b`, `--bundle`|||
+|`-s`, `--sub`|The subscription name||
+
+
+### `get-compaction-threshold`
+Get compactionThreshold for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces get-compaction-threshold tenant/namespace
+```
+
+### `set-compaction-threshold`
+Set compactionThreshold for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces set-compaction-threshold tenant/namespace options
+```
+
+Options
+|Flag|Description|Default|
+|----|---|---|
+|`-t`, `--threshold`|Maximum number of bytes in a topic backlog before compaction is triggered (eg: 10M, 16G, 3T). 0 disables automatic compaction|0|
+
+
+### `get-offload-threshold`
+Get offloadThreshold for a namespace
+
+Usage
+```bash
+$ pulsar-admin namespaces get-offload-threshold tenant/namespace
+```
+
+### `set-offload-threshold`
+Set offloadThreshold for a namespace
+
+Usage: `pulsar-admin namespaces set-offload-threshold tenant/namespace options`
+
+Options
+|Flag|Description|Default|
+|----|---|---|
+|`-s`, `--size`|Maximum number of bytes stored in the pulsar cluster for a topic before data will start being automatically offloaded to longterm storage (eg: 10M, 16G, 3T, 100). Negative values disable automatic offload. 0 triggers offloading as soon as possible.|-1|
+
+
+
+## `ns-isolation-policy`
+Operations for managing namespace isolation policies.
+
+Usage
+```bash
+$ pulsar-admin ns-isolation-policy subcommand
+```
+
+Subcommands
+* `set`
+* `get`
+* `list`
+* `delete`
+
+### `set`
+Create/update a namespace isolation policy for a cluster. This operation requires Pulsar superuser privileges.
+
+Usage
+```bash
+$ pulsar-admin ns-isolation-policy set cluster-name policy-name options
+```
+
+Options
+|Flag|Description|Default|
+|----|---|---|
+|`--auto-failover-policy-params`|Comma-separated name=value auto failover policy parameters|[]|
+|`--auto-failover-policy-type`|Auto failover policy type name. Currently available options: min_available.|[]|
+|`--namespaces`|Comma-separated namespaces regex list|[]|
+|`--primary`|Comma-separated primary broker regex list|[]|
+|`--secondary`|Comma-separated secondary broker regex list|[]|
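+
+To make these flags concrete, here is an illustrative invocation; the cluster, policy, and regex values are invented, and the `min_limit`/`usage_threshold` parameter names shown are the ones commonly used with the `min_available` policy type:
+
+```bash
+$ pulsar-admin ns-isolation-policy set my-cluster my-policy \
+--namespaces 'my-tenant/my-ns.*' \
+--primary 'broker1.*,broker2.*' \
+--auto-failover-policy-type min_available \
+--auto-failover-policy-params min_limit=2,usage_threshold=80
+```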

### `get`
Get the namespace isolation policy of a cluster. This operation requires Pulsar superuser privileges.

Usage
```bash
$ pulsar-admin ns-isolation-policy get cluster-name policy-name
```

### `list`
List all namespace isolation policies of a cluster. This operation requires Pulsar superuser privileges.

Usage
```bash
$ pulsar-admin ns-isolation-policy list cluster-name
```

### `delete`
Delete the namespace isolation policy of a cluster. This operation requires Pulsar superuser privileges.

Usage
```bash
$ pulsar-admin ns-isolation-policy delete cluster-name policy-name
```


## `sink`

An interface for managing Pulsar IO sinks (egress data from Pulsar)

Usage
```bash
$ pulsar-admin sink subcommand
```

Subcommands
* `create`
* `delete`
* `localrun`


### `create`
Submit a Pulsar IO sink connector to run in a Pulsar cluster

Usage
```bash
$ pulsar-admin sink create options
```

Options
|Flag|Description|Default|
|----|---|---|
|`--className`|The sink’s Java class name||
|`--cpu`|The CPU (in cores) that needs to be allocated per sink instance (applicable only to the Docker runtime)||
|`--customSerdeInputs`|The map of input topics to SerDe class names (as a JSON string)||
|`--disk`|The disk (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime)||
|`--inputs`|The sink’s input topic(s) (multiple topics can be specified as a comma-separated list)||
|`--jar`|Path to the Java jar file for the sink||
|`--name`|The sink’s name||
|`--namespace`|The sink’s namespace||
|`--parallelism`|The sink’s parallelism factor (i.e. the number of sink instances to run)||
|`--processingGuarantees`|The processing guarantees (aka delivery semantics) applied to the sink. Available values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE||
|`--ram`|The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime)||
|`--sinkConfig`|Sink config key/values||
|`--sinkConfigFile`|The path to a YAML config file specifying the sink’s configuration||
|`--tenant`|The sink’s tenant||
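
Example (a sketch; the class name, jar path, and topic are placeholders for your own connector):
```bash
$ pulsar-admin sink create \
--tenant public \
--namespace default \
--name my-sink \
--inputs my-input-topic \
--className org.example.MySink \
--jar /path/to/my-sink.jar
```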

### `delete`
Stop a Pulsar IO sink

Usage
```bash
$ pulsar-admin sink delete options
```

Options
|Flag|Description|Default|
|---|---|---|
|`--name`|The name of the function to delete||
|`--namespace`|The namespace of the function to delete||
|`--tenant`|The tenant of the function to delete||


### `localrun`
Run the Pulsar sink locally (rather than in the Pulsar cluster)

Usage
```bash
$ pulsar-admin sink localrun options
```

Options
|Flag|Description|Default|
|----|---|---|
|`--brokerServiceUrl`|The URL for the Pulsar broker||
|`--className`|The sink’s Java class name||
|`--cpu`|The CPU (in cores) that needs to be allocated per sink instance (applicable only to the Docker runtime)||
|`--customSerdeInputs`|The map of input topics to SerDe class names (as a JSON string)||
|`--disk`|The disk (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime)||
|`--inputs`|The sink’s input topic(s) (multiple topics can be specified as a comma-separated list)||
|`--jar`|Path to the Java jar file for the sink||
|`--name`|The sink’s name||
|`--namespace`|The sink’s namespace||
|`--parallelism`|The sink’s parallelism factor (i.e. the number of sink instances to run)||
|`--processingGuarantees`|The processing guarantees (aka delivery semantics) applied to the sink. Available values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE||
|`--ram`|The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime)||
|`--sinkConfig`|Sink config key/values||
|`--sinkConfigFile`|The path to a YAML config file specifying the sink’s configuration||
|`--tenant`|The sink’s tenant||



## `source`
An interface for managing Pulsar IO sources (ingress data into Pulsar)

Usage
```bash
$ pulsar-admin source subcommand
```

Subcommands
* `create`
* `delete`
* `localrun`


### `create`
Submit a Pulsar IO source connector to run in a Pulsar cluster

Usage
```bash
$ pulsar-admin source create options
```

Options
|Flag|Description|Default|
|----|---|---|
|`--className`|The source’s Java class name||
|`--cpu`|The CPU (in cores) that needs to be allocated per source instance (applicable only to the Docker runtime)||
|`--deserializationClassName`|The SerDe classname for the source||
|`--destinationTopicName`|The Pulsar topic to which data is sent||
|`--disk`|The disk (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime)||
|`--jar`|Path to the Java jar file for the source||
|`--name`|The source’s name||
|`--namespace`|The source’s namespace||
|`--parallelism`|The source’s parallelism factor (i.e. the number of source instances to run)||
|`--processingGuarantees`|The processing guarantees (aka delivery semantics) applied to the source. Available values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE||
|`--ram`|The RAM (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime)||
|`--sourceConfig`|Source config key/values||
|`--sourceConfigFile`|The path to a YAML config file specifying the source’s configuration||
|`--tenant`|The source’s tenant||
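
Example (a sketch; the class name, jar path, and destination topic are placeholders for your own connector):
```bash
$ pulsar-admin source create \
--tenant public \
--namespace default \
--name my-source \
--destinationTopicName my-topic \
--className org.example.MySource \
--jar /path/to/my-source.jar
```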

### `delete`
Stop a Pulsar IO source

Usage
```bash
$ pulsar-admin source delete options
```

Options
|Flag|Description|Default|
|---|---|---|
|`--name`|The name of the function to delete||
|`--namespace`|The namespace of the function to delete||
|`--tenant`|The tenant of the function to delete||


### `localrun`
Run the Pulsar source locally (rather than in the Pulsar cluster)

Usage
```bash
$ pulsar-admin source localrun options
```

Options
|Flag|Description|Default|
|----|---|---|
|`--className`|The source’s Java class name||
|`--cpu`|The CPU (in cores) that needs to be allocated per source instance (applicable only to the Docker runtime)||
|`--deserializationClassName`|The SerDe classname for the source||
|`--destinationTopicName`|The Pulsar topic to which data is sent||
|`--disk`|The disk (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime)||
|`--jar`|Path to the Java jar file for the source||
|`--name`|The source’s name||
|`--namespace`|The source’s namespace||
|`--parallelism`|The source’s parallelism factor (i.e. the number of source instances to run)||
|`--processingGuarantees`|The processing guarantees (aka delivery semantics) applied to the source. Available values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE||
|`--ram`|The RAM (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime)||
|`--sourceConfig`|Source config key/values||
|`--sourceConfigFile`|The path to a YAML config file specifying the source’s configuration||
|`--tenant`|The source’s tenant||



## `topics`
Operations for managing Pulsar topics (both persistent and non-persistent)

Usage
```bash
$ pulsar-admin topics subcommand
```

Subcommands
* `compact`
* `compaction-status`
* `offload`
* `offload-status`
* `create-partitioned-topic`
* `delete-partitioned-topic`
* `get-partitioned-topic-metadata`
* `list`
* `list-in-bundle`
* `terminate`
* `permissions`
* `grant-permission`
* `revoke-permission`
* `lookup`
* `bundle-range`
* `delete`
* `unload`
* `subscriptions`
* `unsubscribe`
* `stats`
* `stats-internal`
* `info-internal`
* `partitioned-stats`
* `skip`
* `skip-all`
* `expire-messages`
* `expire-messages-all-subscriptions`
* `peek-messages`
* `reset-cursor`


### `compact`
Run compaction on the specified topic (persistent topics only)

Usage
```bash
$ pulsar-admin topics compact persistent://tenant/namespace/topic
```

### `compaction-status`
Check the status of a topic compaction (persistent topics only)

Usage
```bash
$ pulsar-admin topics compaction-status persistent://tenant/namespace/topic
```

Options
|Flag|Description|Default|
|----|---|---|
|`-w`, `--wait-complete`|Wait for compaction to complete|false|


### `offload`
Trigger offload of data from a topic to long-term storage (e.g. Amazon S3)

Usage
```bash
$ pulsar-admin topics offload persistent://tenant/namespace/topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`-s`, `--size-threshold`|The maximum amount of data to keep in BookKeeper for the specific topic||


### `offload-status`
Check the status of data offloading from a topic to long-term storage

Usage
```bash
$ pulsar-admin topics offload-status persistent://tenant/namespace/topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`-w`, `--wait-complete`|Wait for offloading to complete|false|


### `create-partitioned-topic`
Create a partitioned topic. A partitioned topic must be created before producers can publish to it.

Usage
```bash
$ pulsar-admin topics create-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`-p`, `--partitions`|The number of partitions for the topic|0|
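
Example (illustrative only; the topic name is a placeholder): create a topic with 4 partitions.
```bash
$ pulsar-admin topics create-partitioned-topic \
persistent://my-tenant/my-ns/my-topic \
--partitions 4
```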

### `delete-partitioned-topic`
Delete a partitioned topic. This will also delete all the partitions of the topic if they exist.

Usage
```bash
$ pulsar-admin topics delete-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic
```

### `get-partitioned-topic-metadata`
Get the partitioned topic metadata. If the topic is not created or is a non-partitioned topic, this will return an empty topic with zero partitions.

Usage
```bash
$ pulsar-admin topics get-partitioned-topic-metadata {persistent|non-persistent}://tenant/namespace/topic
```

### `list`
Get the list of topics under a namespace

Usage
```bash
$ pulsar-admin topics list tenant/cluster/namespace
```

### `list-in-bundle`
Get a list of non-persistent topics present under a namespace bundle

Usage
```bash
$ pulsar-admin topics list-in-bundle tenant/namespace options
```

Options
|Flag|Description|Default|
|---|---|---|
|`-b`, `--bundle`|The bundle range||


### `terminate`
Terminate a topic (disallow further messages from being published on the topic)

Usage
```bash
$ pulsar-admin topics terminate {persistent|non-persistent}://tenant/namespace/topic
```

### `permissions`
Get the permissions on a topic. This retrieves the effective permissions for a destination: the permissions set at the namespace level combined (union) with any specific permissions set on the topic.

Usage
```bash
$ pulsar-admin topics permissions topic
```

### `grant-permission`
Grant a new permission to a client role on a single topic

Usage
```bash
$ pulsar-admin topics grant-permission {persistent|non-persistent}://tenant/namespace/topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`--actions`|Actions to be granted (`produce` or `consume`)||
|`--role`|The client role to which to grant the permissions||
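
Example (illustrative only; the role and topic names are placeholders): allow `my-app-role` to both produce and consume.
```bash
$ pulsar-admin topics grant-permission \
persistent://my-tenant/my-ns/my-topic \
--role my-app-role \
--actions produce,consume
```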

### `revoke-permission`
Revoke permissions from a client role on a single topic. If the permission was not set at the topic level, but rather at the namespace level, this operation will return an error (HTTP status code 412).

Usage
```bash
$ pulsar-admin topics revoke-permission topic
```

### `lookup`
Look up a topic from the current serving broker

Usage
```bash
$ pulsar-admin topics lookup topic
```

### `bundle-range`
Get the namespace bundle which contains the given topic

Usage
```bash
$ pulsar-admin topics bundle-range topic
```

### `delete`
Delete a topic. The topic cannot be deleted if there are any active subscriptions or producers connected to the topic.

Usage
```bash
$ pulsar-admin topics delete topic
```

### `unload`
Unload a topic

Usage
```bash
$ pulsar-admin topics unload topic
```

### `subscriptions`
Get the list of subscriptions on the topic

Usage
```bash
$ pulsar-admin topics subscriptions topic
```

### `unsubscribe`
Delete a durable subscriber from a topic

Usage
```bash
$ pulsar-admin topics unsubscribe topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`-s`, `--subscription`|The subscription to delete||


### `stats`
Get the stats for the topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period.

Usage
```bash
$ pulsar-admin topics stats topic
```

### `stats-internal`
Get the internal stats for the topic

Usage
```bash
$ pulsar-admin topics stats-internal topic
```

### `info-internal`
Get the internal metadata info for the topic

Usage
```bash
$ pulsar-admin topics info-internal topic
```

### `partitioned-stats`
Get the stats for the partitioned topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period.

Usage
```bash
$ pulsar-admin topics partitioned-stats topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`--per-partition`|Get per-partition stats|false|


### `skip`
Skip some messages for the subscription

Usage
```bash
$ pulsar-admin topics skip topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`-n`, `--count`|The number of messages to skip|0|
|`-s`, `--subscription`|The subscription on which to skip messages||


### `skip-all`
Skip all the messages for the subscription

Usage
```bash
$ pulsar-admin topics skip-all topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`-s`, `--subscription`|The subscription to clear||


### `expire-messages`
Expire messages that are older than the given expiry time (in seconds) for the subscription.

Usage
```bash
$ pulsar-admin topics expire-messages topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0|
|`-s`, `--subscription`|The subscription to skip messages on||


### `expire-messages-all-subscriptions`
Expire messages older than the given expiry time (in seconds) for all subscriptions

Usage
```bash
$ pulsar-admin topics expire-messages-all-subscriptions topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0|


### `peek-messages`
Peek some messages for the subscription.

Usage
```bash
$ pulsar-admin topics peek-messages topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`-n`, `--count`|The number of messages|0|
|`-s`, `--subscription`|Subscription to get messages from||


### `reset-cursor`
Reset the position for a subscription to the position closest to the given timestamp

Usage
```bash
$ pulsar-admin topics reset-cursor topic options
```

Options
|Flag|Description|Default|
|---|---|---|
|`-s`, `--subscription`|Subscription to reset position on||
|`-t`, `--time`|How far back in time to reset the cursor, expressed in minutes, hours, days, or weeks. Examples: `100m`, `3h`, `2d`, `5w`.||
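
Example (illustrative only; the topic and subscription names are placeholders): rewind `my-subscription` by two hours.
```bash
$ pulsar-admin topics reset-cursor \
persistent://my-tenant/my-ns/my-topic \
--subscription my-subscription \
--time 2h
```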


## `tenants`
Operations for managing tenants

Usage
```bash
$ pulsar-admin tenants subcommand
```

Subcommands
* `list`
* `get`
* `create`
* `update`
* `delete`

### `list`
List the existing tenants

Usage
```bash
$ pulsar-admin tenants list
```

### `get`
Get the configuration of a tenant

Usage
```bash
$ pulsar-admin tenants get tenant-name
```

### `create`
Create a new tenant

Usage
```bash
$ pulsar-admin tenants create tenant-name options
```

Options
|Flag|Description|Default|
|----|---|---|
|`-r`, `--admin-roles`|Comma-separated admin roles||
|`-c`, `--allowed-clusters`|Comma-separated allowed clusters||

### `update`
Update a tenant

Usage
```bash
$ pulsar-admin tenants update tenant-name options
```

Options
|Flag|Description|Default|
|----|---|---|
|`-r`, `--admin-roles`|Comma-separated admin roles||
|`-c`, `--allowed-clusters`|Comma-separated allowed clusters||


### `delete`
Delete an existing tenant

Usage
```bash
$ pulsar-admin tenants delete tenant-name
```


## `resource-quotas`
Operations for managing resource quotas

Usage
```bash
$ pulsar-admin resource-quotas subcommand
```

Subcommands
* `get`
* `set`
* `reset-namespace-bundle-quota`


### `get`
Get the resource quota for a specified namespace bundle, or the default quota if no namespace/bundle is specified.

Usage
```bash
$ pulsar-admin resource-quotas get options
```

Options
|Flag|Description|Default|
|----|---|---|
|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.||
|`-n`, `--namespace`|The namespace||


### `set`
Set the resource quota for the specified namespace bundle, or the default quota if no namespace/bundle is specified.

Usage
```bash
$ pulsar-admin resource-quotas set options
```

Options
|Flag|Description|Default|
|----|---|---|
|`-bi`, `--bandwidthIn`|The expected inbound bandwidth (in bytes/second)|0|
|`-bo`, `--bandwidthOut`|Expected outbound bandwidth (in bytes/second)|0|
|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.||
|`-d`, `--dynamic`|Allow the quota to be dynamically re-calculated (or not)|false|
|`-mem`, `--memory`|Expected memory usage (in megabytes)|0|
|`-mi`, `--msgRateIn`|Expected incoming messages per second|0|
|`-mo`, `--msgRateOut`|Expected outgoing messages per second|0|
|`-n`, `--namespace`|The namespace as tenant/namespace, for example my-tenant/my-ns. Must be specified together with -b/--bundle.||
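
Example (a sketch; the namespace is a placeholder, and `0x00000000_0xffffffff` is assumed here to denote the full bundle range):
```bash
$ pulsar-admin resource-quotas set \
--namespace my-tenant/my-ns \
--bundle 0x00000000_0xffffffff \
--msgRateIn 1000 \
--msgRateOut 2000 \
--bandwidthIn 1048576 \
--bandwidthOut 2097152 \
--memory 100
```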

### `reset-namespace-bundle-quota`
Reset the specified namespace bundle's resource quota to the default value.

Usage
```bash
$ pulsar-admin resource-quotas reset-namespace-bundle-quota options
```

Options
|Flag|Description|Default|
|----|---|---|
|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.||
|`-n`, `--namespace`|The namespace||



## `schemas`
Operations related to schemas associated with Pulsar topics.

Usage
```bash
$ pulsar-admin schemas subcommand
```

Subcommands
* `upload`
* `delete`
* `get`


### `upload`
Upload the schema definition for a topic

Usage
```bash
$ pulsar-admin schemas upload persistent://tenant/namespace/topic options
```

Options
|Flag|Description|Default|
|----|---|---|
|`--filename`|The path to the schema definition file. An example schema file is available under the `conf` directory.||


### `delete`
Delete the schema definition associated with a topic

Usage
```bash
$ pulsar-admin schemas delete persistent://tenant/namespace/topic
```


### `get`
Retrieve the schema definition associated with a topic (at a given version, if a version is supplied).

Usage
```bash
$ pulsar-admin schemas get persistent://tenant/namespace/topic options
```

Options
|Flag|Description|Default|
|----|---|---|
|`--version`|The version of the schema definition to retrieve for a topic.||

diff --git a/site2/docs/reference-rest-api.md b/site2/docs/reference-rest-api.md
new file mode 100644
index 0000000000000000000000000000000000000000..0c89ec4968cb3fdde1750199086852b88ee8dafc
--- /dev/null
+++ b/site2/docs/reference-rest-api.md
@@ -0,0 +1,8 @@
+---
+id: reference-rest-api
+title: Pulsar Admin Rest API
+sidebar_label: REST API
+---
+
+# [Pulsar Admin REST API](/staging/admin-rest-api)
+
diff --git a/site2/docs/security-athenz.md b/site2/docs/security-athenz.md
new file mode 100644
index 0000000000000000000000000000000000000000..c349cae1bb915c697129b676bb6ec244bd6d4968
--- /dev/null
+++ b/site2/docs/security-athenz.md
@@ -0,0 +1,84 @@
+---
+id: security-athenz
+title: Authentication using Athenz
+sidebar_label: Authentication using Athenz
+---
+
+[Athenz](https://github.com/yahoo/athenz) is a role-based authentication/authorization system. In Pulsar, Athenz role tokens (aka *z-tokens*) can be used to establish the identity of the client.
+
+## Athenz authentication settings
+
+In a [decentralized Athenz system](https://github.com/yahoo/athenz/blob/master/docs/dev_decentralized_access.md) there is both an [authori**Z**ation **M**anagement **S**ystem](https://github.com/yahoo/athenz/blob/master/docs/setup_zms.md) (ZMS) server and an [authori**Z**ation **T**oken **S**ystem](https://github.com/yahoo/athenz/blob/master/docs/setup_zts.md) (ZTS) server.
+
+To begin, you need to set up Athenz service access control. You should create domains for the *provider* (which provides some resources to other services with some authentication/authorization policies) and the *tenant* (which is provisioned to access some resources in a provider). In this case, the provider corresponds to the Pulsar service itself and the tenant corresponds to each application using Pulsar (typically, a tenant in Pulsar).
+
+### Create the tenant domain and service
+
+On the tenant side, you need to:
+
+1. Create a domain, such as `shopping`
+2. Generate a private/public key pair
+3. Create a service, such as `some_app`, on the domain with the public key
+
+Note that the private key generated in step 2 needs to be specified when the Pulsar client connects to the broker (see client configuration examples for [Java](client-libraries-java.md#tls-authentication) and [C++](client-libraries-cpp.md#tls-authentication)).
+
+For more specific steps involving the Athenz UI, please refer to [this doc](https://github.com/yahoo/athenz/blob/master/docs/example_service_athenz_setup.md#client-tenant-domain).
+
+### Create the provider domain and add the tenant service to some role members
+
+On the provider side, you need to:
+
+1. Create a domain, such as `pulsar`
+2. Create a role
+3. Add the tenant service to members of the role
+
+Note that in step 2 any action and resource can be specified since they are not used on Pulsar. In other words, Pulsar uses the Athenz role token only for authentication, *not* for authorization.
+
+For more specific steps involving the Athenz UI, please refer to [this doc](https://github.com/yahoo/athenz/blob/master/docs/example_service_athenz_setup.md#server-provider-domain).
+
+## Configure the broker for Athenz
+
+In the `conf/broker.conf` configuration file in your Pulsar installation, you need to provide the class name of the Athenz authentication provider as well as a comma-separated list of provider domain names.
+
+```properties
+# Add the Athenz auth provider
+authenticationEnabled=true
+authorizationEnabled=true
+authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderAthenz
+athenzDomainNames=pulsar
+
+# Enable TLS
+tlsEnabled=true
+tlsCertificateFilePath=/path/to/broker-cert.pem
+tlsKeyFilePath=/path/to/broker-key.pem
+```
+
+## Configure clients for Athenz
+
+For more information on Pulsar client authentication using Athenz, see the following language-specific docs:
+
+* [Java client](client-libraries-java.md#athenz)
+
+## Configure CLI tools for Athenz
+
+[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation.
+
+You'll need to add the following authentication parameters to that file to use Athenz with Pulsar's CLI tools:
+
+```properties
+# URL for the broker
+serviceUrl=https://broker.example.com:8443/
+
+# Set Athenz auth plugin and its parameters
+authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationAthenz
+authParams={"tenantDomain":"shopping","tenantService":"some_app","providerDomain":"pulsar","privateKey":"file:///path/to/private.pem","keyId":"v1"}
+
+# Enable TLS
+useTls=true
+tlsAllowInsecureConnection=false
+tlsTrustCertsFilePath=/path/to/cacert.pem
+```
diff --git a/site2/docs/security-authorization.md b/site2/docs/security-authorization.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b879a08d0f48e9556e69f5106c2dd303e172dbb
--- /dev/null
+++ b/site2/docs/security-authorization.md
@@ -0,0 +1,89 @@
+---
+id: security-authorization
+title: Authentication and authorization in Pulsar
+sidebar_label: Authorization and ACLs
+---
+
+In Pulsar, the [authentication provider](security-overview.md#authentication-providers) is charged with properly identifying clients and
+associating them with [role tokens](security-overview.md#role-tokens). *Authorization* is the process that determines *what* clients are able to do.
+
+Authorization in Pulsar is managed at the tenant level, which means that you can have multiple authorization schemes active
+in a single Pulsar instance. You could, for example, create a `shopping` tenant that has one set of [roles](security-overview.md#role-tokens)
+and applies to a shopping application used by your company, while an `inventory` tenant would be used only by an inventory application.
+
+## Creating a new tenant
+
+A Pulsar tenant is typically provisioned by Pulsar instance administrators or by some kind of self-service portal.
+
+Tenants are managed using the [`pulsar-admin`](reference-pulsar-admin.md) tool.
+Here's an example tenant creation command:
+
+```shell
+$ bin/pulsar-admin tenants create my-tenant \
+  --admin-roles my-admin-role \
+  --allowed-clusters us-west,us-east
+```
+
+This command will create a new tenant `my-tenant` that will be allowed to use the clusters `us-west` and `us-east`.
+
+A client that successfully identified itself as having the role `my-admin-role` would then be allowed to perform all administrative tasks on this tenant.
+
+The structure of topic names in Pulsar reflects the hierarchy between tenants, clusters, and namespaces, for example `persistent://tenant/namespace/topic`.
+
+## Managing permissions
+
+Permissions in Pulsar are managed at the namespace level (that is, within tenants and clusters).
+
+## Superusers
+
+In Pulsar you can assign certain roles to be *superusers* of the system. A superuser is allowed to perform all administrative tasks on all tenants and namespaces, as well as to publish and subscribe to all topics.
+
+Superusers are configured in the [`conf/broker.conf`](reference-configuration.md#broker) configuration file, using the [`superUserRoles`](reference-configuration.md#broker-superUserRoles) parameter:
+
+```properties
+superUserRoles=my-super-user-1,my-super-user-2
+```
+
+Typically, superuser roles are used for administrators and clients but also for broker-to-broker authorization. When using [geo-replication](administration-geo.md), every broker
+needs to be able to publish to other clusters' topics.
+
+## Pulsar admin authentication
+
+The same authentication configuration applies when using the `PulsarAdmin` client:
+
+```java
+String authPluginClassName = "com.org.MyAuthPluginClass";
+String authParams = "param1:value1";
+boolean useTls = false;
+boolean tlsAllowInsecureConnection = false;
+String tlsTrustCertsFilePath = null;
+
+ClientConfiguration config = new ClientConfiguration();
+config.setAuthentication(authPluginClassName, authParams);
+config.setUseTls(useTls);
+config.setTlsAllowInsecureConnection(tlsAllowInsecureConnection);
+config.setTlsTrustCertsFilePath(tlsTrustCertsFilePath);
+
+PulsarAdmin admin = new PulsarAdmin(url, config);
+```
+
+To use TLS, enable it and point the client at the trusted CA certificate:
+
+```java
+String authPluginClassName = "com.org.MyAuthPluginClass";
+String authParams = "param1:value1";
+boolean useTls = true;
+boolean tlsAllowInsecureConnection = false;
+String tlsTrustCertsFilePath = "/path/to/cacert.pem";
+
+ClientConfiguration config = new ClientConfiguration();
+config.setAuthentication(authPluginClassName, authParams);
+config.setUseTls(useTls);
+config.setTlsAllowInsecureConnection(tlsAllowInsecureConnection);
+config.setTlsTrustCertsFilePath(tlsTrustCertsFilePath);
+
+PulsarAdmin admin = new PulsarAdmin(url, config);
+```
diff --git a/site2/docs/security-encryption.md b/site2/docs/security-encryption.md
new file mode 100644
index 0000000000000000000000000000000000000000..b2b3b14346694b7674268d5ea445b356d95a28ca
--- /dev/null
+++ b/site2/docs/security-encryption.md
@@ -0,0 +1,169 @@
+---
+id: security-encryption
+title: Pulsar Encryption
+sidebar_label: End-to-End Encryption
+---
+
+Pulsar encryption allows applications to encrypt messages at the producer and decrypt them at the consumer. Encryption is performed using a public/private key pair configured by the application. Encrypted messages can only be decrypted by consumers with a valid key.
+
+## Asymmetric and symmetric encryption
+
+Pulsar uses a dynamically generated symmetric AES key to encrypt messages (data).
+The AES key (the data key) is encrypted using an application-provided ECDSA/RSA key pair, so there is no need to share the secret with everyone.
+
+A key is a public/private key pair used for encryption/decryption. The producer uses the public key, and the consumer uses the private key of the pair.
+
+The application configures the producer with the public key. This key is used to encrypt the AES data key. The encrypted data key is sent as part of the message header. Only entities with the private key (in this case the consumer) will be able to decrypt the data key, which is used to decrypt the message.
+
+A message can be encrypted with more than one key. Any one of the keys used for encrypting the message is sufficient to decrypt the message.
+
+Pulsar does not store the encryption key anywhere in the Pulsar service. If you lose or delete the private key, your messages are irretrievably lost and unrecoverable.
+
+## Producer
+![alt text](/docs/assets/pulsar-encryption-producer.jpg "Pulsar Encryption Producer")
+
+## Consumer
+![alt text](/docs/assets/pulsar-encryption-consumer.jpg "Pulsar Encryption Consumer")
+
+## Getting started
+
+1. Create your ECDSA or RSA public/private key pair.
+```shell
+openssl ecparam -name secp521r1 -genkey -param_enc explicit -out test_ecdsa_privkey.pem
+openssl ec -in test_ecdsa_privkey.pem -pubout -outform pkcs8 -out test_ecdsa_pubkey.pem
+```
+2. Add the public and private key to your key management system, and configure your producer clients to retrieve public keys and your consumer clients to retrieve private keys.
+3. Implement the `CryptoKeyReader::getPublicKey()` interface for the producer and the `CryptoKeyReader::getPrivateKey()` interface for the consumer; these are invoked by the Pulsar client to load the keys.
+4. Add the encryption key to the producer configuration: `conf.addEncryptionKey("myapp.key")`
+5. Add the `CryptoKeyReader` implementation to the producer/consumer config: `conf.setCryptoKeyReader(keyReader)`
+6. Sample producer application:
+```java
+class RawFileKeyReader implements CryptoKeyReader {
+
+    String publicKeyFile = "";
+    String privateKeyFile = "";
+
+    RawFileKeyReader(String pubKeyFile, String privKeyFile) {
+        publicKeyFile = pubKeyFile;
+        privateKeyFile = privKeyFile;
+    }
+
+    @Override
+    public EncryptionKeyInfo getPublicKey(String keyName, Map<String, String> keyMeta) {
+        EncryptionKeyInfo keyInfo = new EncryptionKeyInfo();
+        try {
+            keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile)));
+        } catch (IOException e) {
+            System.out.println("ERROR: Failed to read public key from file " + publicKeyFile);
+            e.printStackTrace();
+        }
+        return keyInfo;
+    }
+
+    @Override
+    public EncryptionKeyInfo getPrivateKey(String keyName, Map<String, String> keyMeta) {
+        EncryptionKeyInfo keyInfo = new EncryptionKeyInfo();
+        try {
+            keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile)));
+        } catch (IOException e) {
+            System.out.println("ERROR: Failed to read private key from file " + privateKeyFile);
+            e.printStackTrace();
+        }
+        return keyInfo;
+    }
+}
+
+PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080");
+
+ProducerConfiguration prodConf = new ProducerConfiguration();
+prodConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem"));
+prodConf.addEncryptionKey("myappkey");
+
+Producer producer = pulsarClient.createProducer("persistent://my-property/use/my-ns/my-topic", prodConf);
+
+for (int i = 0; i < 10; i++) {
+    producer.send("my-message".getBytes());
+}
+
+pulsarClient.close();
+```
+7. Sample consumer application:
+```java
+class RawFileKeyReader implements CryptoKeyReader {
+
+    String publicKeyFile = "";
+    String privateKeyFile = "";
+
+    RawFileKeyReader(String pubKeyFile, String privKeyFile) {
+        publicKeyFile = pubKeyFile;
+        privateKeyFile = privKeyFile;
+    }
+
+    @Override
+    public EncryptionKeyInfo getPublicKey(String keyName, Map<String, String> keyMeta) {
+        EncryptionKeyInfo keyInfo = new EncryptionKeyInfo();
+        try {
+            keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile)));
+        } catch (IOException e) {
+            System.out.println("ERROR: Failed to read public key from file " + publicKeyFile);
+            e.printStackTrace();
+        }
+        return keyInfo;
+    }
+
+    @Override
+    public EncryptionKeyInfo getPrivateKey(String keyName, Map<String, String> keyMeta) {
+        EncryptionKeyInfo keyInfo = new EncryptionKeyInfo();
+        try {
+            keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile)));
+        } catch (IOException e) {
+            System.out.println("ERROR: Failed to read private key from file " + privateKeyFile);
+            e.printStackTrace();
+        }
+        return keyInfo;
+    }
+}
+
+ConsumerConfiguration consConf = new ConsumerConfiguration();
+consConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem"));
+PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080");
+Consumer consumer = pulsarClient.subscribe("persistent://my-property/use/my-ns/my-topic", "my-subscriber-name", consConf);
+Message msg = null;
+
+for (int i = 0; i < 10; i++) {
+    msg = consumer.receive();
+    // do something
+    System.out.println("Received: " + new String(msg.getData()));
+}
+
+// Acknowledge the consumption of all messages at once
+consumer.acknowledgeCumulative(msg);
+pulsarClient.close();
+```
+
+## Key rotation
+Pulsar generates a new AES data key every 4 hours or after a certain number of messages have been published. The producer fetches the asymmetric public key every 4 hours by calling `CryptoKeyReader::getPublicKey()` to retrieve the latest version.
+
+## Enabling encryption at the producer application
+If you produce messages that are consumed across application boundaries, you need to ensure that consumers in other applications have access to one of the private keys that can decrypt the messages. This can be done in two ways:
+1. The consumer application provides you access to their public key, which you add to your producer keys
+1. You grant access to one of the private keys from the pairs used by the producer
+
+In some cases, the producer may want to encrypt the messages with multiple keys. For this, add all such keys to the config. The consumer will be able to decrypt the message as long as it has access to at least one of the keys.
+
+For example, if messages need to be encrypted using two keys `myapp.messagekey1` and `myapp.messagekey2`:
+```java
+conf.addEncryptionKey("myapp.messagekey1");
+conf.addEncryptionKey("myapp.messagekey2");
+```
+## Decrypting encrypted messages at the consumer application
+Consumers require access to one of the private keys to decrypt messages produced by the producer. If you would like to receive encrypted messages, create a public/private key pair and give your public key to the producer application to encrypt messages with.
+
+## Handling failures
+* Producer/consumer loses access to the key
+  * The producer action will fail, indicating the cause of the failure. The application has the option to proceed with sending unencrypted messages in such cases. Call `conf.setCryptoFailureAction(ProducerCryptoFailureAction)` to control the producer behavior. The default behavior is to fail the request.
+  * If consumption fails due to a decryption failure or missing keys, the application has the option to consume the encrypted message or discard it. Call `conf.setCryptoFailureAction(ConsumerCryptoFailureAction)` to control the consumer behavior. The default behavior is to fail the request. The application will never be able to decrypt the messages if the private key is permanently lost.
+* Batch messaging
+  * If decryption fails and the message contains batched messages, the client will not be able to retrieve the individual messages in the batch, so message consumption fails even if `conf.setCryptoFailureAction()` is set to `CONSUME`.
+* If decryption fails, message consumption stops and the application will notice backlog growth in addition to decryption failure messages in the client log. If the application does not have access to the private key to decrypt the message, the only option is to skip/discard the backlogged messages.
+
diff --git a/site2/docs/security-extending.md b/site2/docs/security-extending.md
new file mode 100644
index 0000000000000000000000000000000000000000..436f240722751d813f000d18e17f77cbc21443e3
--- /dev/null
+++ b/site2/docs/security-extending.md
@@ -0,0 +1,206 @@
+---
+id: security-extending
+title: Extending Authentication and Authorization in Pulsar
+sidebar_label: Extending
+---
+
+Pulsar provides a way to use custom authentication and authorization mechanisms.
+
+## Authentication
+
+Pulsar supports mutual TLS and Athenz authentication plugins, which can be used as described
+in [Security](security-overview.md).
+
+It is possible to use a custom authentication mechanism by providing the implementation in the
+form of two plugins: one for the client library and one for the Pulsar broker to validate
+the credentials.
+
+### Client authentication plugin
+
+For the client library, you will need to implement `org.apache.pulsar.client.api.Authentication`. This class can then be passed
+when creating a Pulsar client:
+
+```java
+PulsarClient client = PulsarClient.builder()
+    .serviceUrl("pulsar://localhost:6650")
+    .authentication(new MyAuthentication())
+    .build();
+```
+
+For reference, there are two interfaces to implement on the client side:
+ * `Authentication` -> http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/Authentication.html
+ * `AuthenticationDataProvider` -> http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/AuthenticationDataProvider.html
+
+The `Authentication` implementation, in turn, provides the client credentials in the form of `org.apache.pulsar.client.api.AuthenticationDataProvider`. This leaves
+room to return different kinds of authentication tokens for different types of connections, or to pass a certificate chain to use for TLS.
+
+Examples of client authentication providers can be found at:
+
+ * Mutual TLS Auth -- https://github.com/apache/incubator-pulsar/tree/master/pulsar-client/src/main/java/org/apache/pulsar/client/impl/auth
+ * Athenz -- https://github.com/apache/incubator-pulsar/tree/master/pulsar-client-auth-athenz/src/main/java/org/apache/pulsar/client/impl/auth
+
+### Broker authentication plugin
+
+On the broker side, the corresponding plugin is needed to validate the credentials
+passed by the client. A broker can support multiple authentication providers
+at the same time.
+
+In `conf/broker.conf` it's possible to specify a list of valid providers:
+
+```properties
+# Authentication provider name list (a comma-separated list of class names)
+authenticationProviders=
+```
+
+There is a single interface to implement, `org.apache.pulsar.broker.authentication.AuthenticationProvider`:
+
+```java
+/**
+ * Provider of authentication mechanism
+ */
+public interface AuthenticationProvider extends Closeable {
+
+    /**
+     * Perform initialization for the authentication provider
+     *
+     * @param config
+     *            broker config object
+     * @throws IOException
+     *             if the initialization fails
+     */
+    void initialize(ServiceConfiguration config) throws IOException;
+
+    /**
+     * @return the authentication method name supported by this provider
+     */
+    String getAuthMethodName();
+
+    /**
+     * Validate the authentication for the given credentials with the specified authentication data
+     *
+     * @param authData
+     *            provider specific authentication data
+     * @return the "role" string for the authenticated connection, if the authentication was successful
+     * @throws AuthenticationException
+     *             if the credentials are not valid
+     */
+    String authenticate(AuthenticationDataSource authData) throws AuthenticationException;
+
+}
+```
+
+Examples of broker authentication plugins:
+
+ * Mutual TLS -- https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker-common/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderTls.java
+ * Athenz -- https://github.com/apache/incubator-pulsar/blob/master/pulsar-broker-auth-athenz/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderAthenz.java
+
+## Authorization
+
+Authorization is the operation that checks whether a particular "role" or "principal" is
+allowed to perform a certain operation.
+
+By default, Pulsar provides an embedded authorization provider, though it's possible to
+configure a different one through a plugin.
+
+To provide a custom provider, one needs to implement the
+`org.apache.pulsar.broker.authorization.AuthorizationProvider` interface, have this class in the
+Pulsar broker classpath and configure it in `conf/broker.conf`:
+
+```properties
+# Authorization provider fully qualified class-name
+authorizationProvider=org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider
+```
+
+```java
+/**
+ * Provider of authorization mechanism
+ */
+public interface AuthorizationProvider extends Closeable {
+
+    /**
+     * Perform initialization for the authorization provider
+     *
+     * @param conf
+     *            broker config object
+     * @param configCache
+     *            pulsar zk configuration cache service
+     * @throws IOException
+     *             if the initialization fails
+     */
+    void initialize(ServiceConfiguration conf, ConfigurationCacheService configCache) throws IOException;
+
+    /**
+     * Check if the specified role has permission to send messages to the specified fully qualified topic name.
+     *
+     * @param topicName
+     *            the fully qualified topic name associated with the topic.
+     * @param role
+     *            the app id used to send messages to the topic.
+     */
+    CompletableFuture<Boolean> canProduceAsync(TopicName topicName, String role,
+            AuthenticationDataSource authenticationData);
+
+    /**
+     * Check if the specified role has permission to receive messages from the specified fully qualified topic name.
+     *
+     * @param topicName
+     *            the fully qualified topic name associated with the topic.
+     * @param role
+     *            the app id used to receive messages from the topic.
+     * @param subscription
+     *            the subscription name defined by the client
+     */
+    CompletableFuture<Boolean> canConsumeAsync(TopicName topicName, String role,
+            AuthenticationDataSource authenticationData, String subscription);
+
+    /**
+     * Check whether the specified role can perform a lookup for the specified topic.
+     *
+     * For that the caller needs to have producer or consumer permission.
+     *
+     * @param topicName
+     * @param role
+     * @return
+     * @throws Exception
+     */
+    CompletableFuture<Boolean> canLookupAsync(TopicName topicName, String role,
+            AuthenticationDataSource authenticationData);
+
+    /**
+     * Grant authorization-action permission on a namespace to the given client
+     *
+     * @param namespace
+     * @param actions
+     * @param role
+     * @param authDataJson
+     *            additional authdata in json format
+     * @return CompletableFuture
+     * @completesWith
+     *             IllegalArgumentException when namespace not found
+     *             IllegalStateException when failed to grant permission
+     */
+    CompletableFuture<Void> grantPermissionAsync(NamespaceName namespace, Set<AuthAction> actions, String role,
+            String authDataJson);
+
+    /**
+     * Grant authorization-action permission on a topic to the given client
+     *
+     * @param topicName
+     * @param role
+     * @param authDataJson
+     *            additional authdata in json format
+     * @return CompletableFuture
+     * @completesWith
+     *             IllegalArgumentException when namespace not found
+     *             IllegalStateException when failed to grant permission
+     */
+    CompletableFuture<Void> grantPermissionAsync(TopicName topicName, Set<AuthAction> actions, String role,
+            String authDataJson);
+
+}
+```
diff --git a/site2/docs/security-overview.md b/site2/docs/security-overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..97c1ffb3aaa2c197f4efb09d295a1ac67f998b18
--- /dev/null
+++ b/site2/docs/security-overview.md
@@ -0,0 +1,39 @@
+---
+id: security-overview
+title: Pulsar Security Overview
+sidebar_label: Overview
+---
+
+Apache Pulsar is the central message bus for a business. It is frequently used to store mission-critical data, and therefore enabling security features is crucial.
+
+By default, there is no encryption, authentication, or authorization configured. Any client can communicate with Apache Pulsar via plain-text service URLs.
+It is critical that access via these plain-text service URLs is restricted to trusted clients only. Network segmentation and/or authorization ACLs can be used
+to restrict access to trusted IPs in such cases. If neither is used, the cluster is wide open and can be accessed by anyone.
+
+Pulsar supports a pluggable authentication mechanism that Pulsar clients can use to authenticate with brokers and proxies. Pulsar
+can also be configured to support multiple authentication sources.
+
+It is strongly recommended to secure the service components in your Apache Pulsar deployment.
+
+## Role Tokens
+
+In Pulsar, a *role* is a string, like `admin` or `app1`, that can represent a single client or multiple clients. Roles are used to control permission for clients
+to produce or consume from certain topics, administer the configuration for tenants, and more.
+
+Apache Pulsar uses an [Authentication Provider](#authentication-providers) to establish the identity of a client and then assign that client a *role token*. This
+role token is then used for [Authorization and ACLs](security-authorization.md) to determine what the client is authorized to do.
+
+## Authentication Providers
+
+Currently Pulsar supports two authentication providers:
+
+- [TLS Authentication](security-tls.md)
+- [Athenz](security-athenz.md)
+
+## Contents
+
+- [Encryption and Authentication using TLS](security-tls.md)
+- [Authentication using Athenz](security-athenz.md)
+- [Authorization and ACLs](security-authorization.md)
+- [End-to-End Encryption](security-encryption.md)
+
diff --git a/site2/docs/security-tls.md b/site2/docs/security-tls.md
new file mode 100644
index 0000000000000000000000000000000000000000..ed27c69e701e38d305cbf06001b487b1c14bd6c6
--- /dev/null
+++ b/site2/docs/security-tls.md
@@ -0,0 +1,141 @@
+---
+id: security-tls
+title: Encryption and Authentication using TLS
+sidebar_label: Encryption and Authentication using TLS
+---
+
+With [TLS](https://en.wikipedia.org/wiki/Transport_Layer_Security) authentication, the server authenticates the client (also called "2-way authentication").
+Since TLS authentication requires TLS encryption, this page shows you how to configure both at the same time.
+
+By default, Apache Pulsar communicates over plain-text service URLs, which means that all data is sent in the clear.
+To encrypt communication, it is recommended to configure all the Apache Pulsar components in your deployment to use TLS encryption.
+
+TLS can be configured for encryption or authentication.
+You may configure just TLS encryption
+(by default TLS encryption includes certificate authentication of the server) and independently choose a separate mechanism
+for client authentication, e.g. TLS, [Athenz](security-athenz.md), etc. Note that TLS encryption, technically speaking, already enables
+1-way authentication in which the client authenticates the server certificate. So when referring to TLS authentication, it is really
+referring to 2-way authentication in which the broker also authenticates the client certificate.
+
+> Note that enabling TLS may have a performance impact due to encryption overhead.
+
+## Creating TLS Certificates
+
+Creating TLS certificates for Pulsar involves creating a [certificate authority](#certificate-authority) (CA), [broker certificate](#broker-certificate), and [client certificate](#client-certificate).
+
+### Certificate authority
+
+The first step is to create the certificate for the CA. The CA will be used to sign both the broker and client certificates, in order to ensure that each party will trust the others.
+
+#### Linux
+
+```bash
+$ CA.pl -newca
+```
+
+#### macOS
+
+```bash
+$ /System/Library/OpenSSL/misc/CA.pl -newca
+```
+
+After answering the question prompts, this will store CA-related files in the `./demoCA` directory. Within that directory:
+
+* `demoCA/cacert.pem` is the public certificate. It is meant to be distributed to all parties involved.
+* `demoCA/private/cakey.pem` is the private key. This is only needed when signing a new certificate for either broker or clients and it must be safely guarded.
+
+### Broker certificate
+
+Once a CA certificate has been created, you can create certificate requests and sign them with the CA.
+
+The following commands will ask you a few questions and then create the certificates. When asked for the common name, you need to match the hostname of the broker. You could also use a wildcard to match a group of broker hostnames, for example `*.broker.usw.example.com`. This ensures that the same certificate can be reused on multiple machines.
+
+```shell
+$ openssl req \
+  -newkey rsa:2048 \
+  -sha256 \
+  -nodes \
+  -out broker-cert.csr \
+  -outform PEM
+```
+
+Convert the key to [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format:
+
+```shell
+$ openssl pkcs8 \
+  -topk8 \
+  -inform PEM \
+  -outform PEM \
+  -in privkey.pem \
+  -out broker-key.pem \
+  -nocrypt
+```
+
+This produces the certificate request `broker-cert.csr` and the key file `broker-key.pem`. Now you can create the signed certificate:
+
+```shell
+$ openssl ca \
+  -out broker-cert.pem \
+  -infiles broker-cert.csr
+```
+
+At this point, you should have a `broker-cert.pem` and a `broker-key.pem` file. These will be needed for the broker.
+
+### Client certificate
+
+To create a client certificate, repeat the steps in the previous section, but this time create `client-cert.pem` and `client-key.pem` files instead.
+
+For the client common name, you need to use a string that you intend to use as the [role token](security-overview.md#role-tokens) for this client, though it doesn't need to match the client hostname.
+
+## Configure the broker for TLS
+
+To configure a Pulsar broker to use TLS authentication, you'll need to make some changes to the `broker.conf` configuration file, which is located in the `conf` directory of your [Pulsar installation](getting-started-standalone.md).
+
+Add these values to the configuration file (substituting the appropriate certificate paths where necessary):
+
+```properties
+# Enable TLS and point the broker to the right certs
+tlsEnabled=true
+tlsCertificateFilePath=/path/to/broker-cert.pem
+tlsKeyFilePath=/path/to/broker-key.pem
+tlsTrustCertsFilePath=/path/to/cacert.pem
+
+# Enable the TLS auth provider
+authenticationEnabled=true
+authorizationEnabled=true
+authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls
+```
+
+## Configure the discovery service
+
+The discovery service used by Pulsar brokers needs to redirect all HTTPS requests, which means that it needs to be trusted by the client as well. Add this configuration in `conf/discovery.conf` in your Pulsar installation:
+
+```properties
+tlsEnabled=true
+tlsCertificateFilePath=/path/to/broker-cert.pem
+tlsKeyFilePath=/path/to/broker-key.pem
+```
+
+## Configure clients
+
+For more information on Pulsar client authentication using TLS, see the following language-specific docs:
+
+* [Java client](client-libraries-java.md)
+* [C++ client](client-libraries-cpp.md)
+
+## Configure CLI tools
+
+[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation.
+
+You'll need to add the following authentication parameters to that file to use TLS with Pulsar's CLI tools:
+
+```properties
+serviceUrl=https://broker.example.com:8443/
+authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls
+authParams=tlsCertFile:/path/to/client-cert.pem,tlsKeyFile:/path/to/client-key.pem
+useTls=true
+tlsAllowInsecureConnection=false
+tlsTrustCertsFilePath=/path/to/cacert.pem
+```
+
diff --git a/site2/tools/docker-build-site.sh b/site2/tools/docker-build-site.sh
new file mode 100755
index 0000000000000000000000000000000000000000..68a75df6a23c1f25635f9c62ca73c7b672c54e4c
--- /dev/null
+++ b/site2/tools/docker-build-site.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# + +# Build Pulsar website within a Docker container + +# Fail script in case of errors +set -e + +ROOT_DIR=$(git rev-parse --show-toplevel) +cd $ROOT_DIR/pulsar-client-cpp + +BUILD_IMAGE_NAME="${BUILD_IMAGE_NAME:-apachepulsar/pulsar-build}" +BUILD_IMAGE_VERSION="${BUILD_IMAGE_VERSION:-ubuntu-16.04}" + +IMAGE="$BUILD_IMAGE_NAME:$BUILD_IMAGE_VERSION" + +echo "---- Build Pulsar website using image $IMAGE" + +#docker pull $IMAGE + +CI_USER=$(id -u) +CI_GROUP=$(id -g) + +DOCKER_CMD="docker run -i -e CI_USER=$CI_USER -e CI_GROUP=$CI_GROUP -v $ROOT_DIR:/pulsar $IMAGE" + +$DOCKER_CMD bash -l -c 'cd /pulsar/site2/website && yarn && yarn build && node ./scripts/replace.js && cp -R ./build/pulsar /pulsar/generated-site/content/staging' diff --git a/site2/website/__tests__/docs.test.js b/site2/website/__tests__/docs.test.js new file mode 100644 index 0000000000000000000000000000000000000000..5864ec680d7b0f3d86f39f5ffc57b56a366a3d22 --- /dev/null +++ b/site2/website/__tests__/docs.test.js @@ -0,0 +1,354 @@ +const puppeteer = require('puppeteer') +const axios = require('axios'); + + +const CWD = process.cwd(); +const siteConfig = require(`${CWD}/siteConfig.js`); + +const timeoutMs = 60000; + +let browser; +let navLinks = []; +let navGroups = []; + +function standaloneDocUrl() { + return `http://localhost:3000${siteConfig.baseUrl}docs/standalone` +} + +async function loadNavLinks() { + let page = await browser.newPage(); + await page.goto(standaloneDocUrl()); + const hrefs = await page.evaluate( + () => Array.from(document.body.querySelectorAll('a.navItem[href]'), ({ href }) => href) + ); + navLinks = hrefs; + const ng = await page.evaluate( + () => Array.from(document.body.querySelectorAll('.navGroups')) + ); + navGroups = ng; +} + +async function newPage() { + const page = await browser.newPage(); + await page.goto(standaloneDocUrl()); + return page +} + +expect.extend({ + toBeTruthyWithMessage(received, errMsg) { + const result = { + pass: received, + message: () => errMsg + }; + return result; + } +}); + +beforeAll(async () => { + browser = await puppeteer.launch({}) + await loadNavLinks() +}) + +async function docLinks(page) { + const hrefs = await page.evaluate(function() { + const main = document.body.querySelector('.mainContainer'); + const article = document.querySelector('article') + // https://github.com/GoogleChrome/puppeteer/issues/2479 + return Array.from(article.querySelectorAll('a[href]'), ({href}) => href.split('#')[0]); + }); + return new Set(hrefs) +} + + +async function findNavLinks(page, title) { + const links = await page.evaluate(function(title) { + const navGroups = Array.from(document.body.querySelectorAll('.navGroup')); + for (ng of navGroups) { + if (title === ng.children[0].innerText) { + return Array.from(ng.querySelectorAll('a[href]'), ({href}) => href) + } + } + + return [] + }, title) + + return links +} + +function ok(response) { + return response.status() === 200 || response.status() === 304 +} + +function shouldSkipLink(link) { + if (link.includes('mailto:')) { + return true + } + if (link.includes('/api/')) { + return true + } + if (link.includes('localhost:8001')) { + return true + } + if (link.includes('localhost:8080')) { + return true + } + if (link.includes('localhost:4000')) { + return true + } + if (link.includes('localhost:6650')) { + return true + } + if (link.includes('localhost:3000')) { + return true + } + if (link.includes('.dcos')) { + return true + } + if (link.includes('192.168')) { + return true + } + if (link.includes('org.apache.pulsar:')) { + 
return true + } + if (link.includes('websockets.')) { + return true + } + if (link.includes('logging.apache.org')) { + return true + } + if (link.includes('pulsar:')) { + return true + } + return false +} + +async function testDocLinks(page, links) { + const results = [] + + for (l of links) { + // console.log("checking doc links for", l) + await page.goto(l) + const doclinks = await docLinks(page) + const result = { + url: l, + broken: [] + } + + for (dl of doclinks) { + if (shouldSkipLink(dl)) { + continue + } + // console.log(" doc link", dl) + try { + const response = await axios.get(dl); + } catch (error) { + console.log(error); + result.broken.push(dl); + } + } + + if (result.broken.length > 0) { + results.push(result) + } + } + return results +} + +function assertResults(results) { + let errMsg = "" + for (r of results) { + let msg = `${r.url} contains broken links:\n` + for (bl of r.broken) { + msg += `\t${bl}\n` + } + errMsg += msg + } + + expect(results.length == 0).toBeTruthyWithMessage(errMsg) +} + + +test('Getting started', async() => { + const page = await newPage() + + const links = await findNavLinks(page, 'Getting started') + expect(links.length).toBeGreaterThan(0); + for (l of links) { + const response = await axios.get(l); + expect(response.status).toBe(200); + } + + const results = await testDocLinks(page, links); + + assertResults(results) +}, timeoutMs) + + +test('Pulsar Functions', async() => { + const page = await newPage() + + const links = await findNavLinks(page, 'Pulsar Functions') + expect(links.length).toBeGreaterThan(0); + for (l of links) { + const response = await axios.get(l); + expect(response.status).toBe(200); + } + + const results = await testDocLinks(page, links); + + assertResults(results) +}, timeoutMs) + + +test('Pulsar IO', async() => { + const page = await newPage() + + const links = await findNavLinks(page, 'Pulsar IO') + expect(links.length).toBeGreaterThan(0); + for (l of links) { + const response = await axios.get(l); + expect(response.status).toBe(200); + } + + const results = await testDocLinks(page, links); + assertResults(results); + +}, timeoutMs) + + + +test('Deployment', async() => { + const page = await newPage() + + const links = await findNavLinks(page, 'Deployment') + expect(links.length).toBeGreaterThan(0); + for (l of links) { + const response = await axios.get(l); + expect(response.status).toBe(200); + } + + const results = await testDocLinks(page, links); + assertResults(results); + +}, 180000) + + + +test('Pulsar administration', async() => { + const page = await newPage() + + const links = await findNavLinks(page, 'Pulsar administration') + expect(links.length).toBeGreaterThan(0); + for (l of links) { + const response = await axios.get(l); + expect(response.status).toBe(200); + } + + const results = await testDocLinks(page, links); + assertResults(results); + +}, timeoutMs) + + +test('Security', async() => { + const page = await newPage() + + const links = await findNavLinks(page, 'Security') + expect(links.length).toBeGreaterThan(0); + for (l of links) { + const response = await axios.get(l); + expect(response.status).toBe(200); + } + + const results = await testDocLinks(page, links); + assertResults(results); + +}, 180000) + + +test('Client libraries', async() => { + const page = await newPage() + + const links = await findNavLinks(page, 'Client libraries') + expect(links.length).toBeGreaterThan(0); + for (l of links) { + const response = await axios.get(l); + expect(response.status).toBe(200); + } + + const results = await 
testDocLinks(page, links); + assertResults(results); + +}, timeoutMs) + + + +test('Admin API', async() => { + const page = await newPage(); + + const links = await findNavLinks(page, 'Admin API'); + expect(links.length).toBeGreaterThan(0); + for (l of links) { + const response = await axios.get(l); + expect(response.status).toBe(200); + } + + const results = await testDocLinks(page, links); + assertResults(results); + +}, timeoutMs) + + +test('Adaptors', async() => { + const page = await newPage(); + + const links = await findNavLinks(page, 'Adaptors'); + expect(links.length).toBeGreaterThan(0); + for (l of links) { + const response = await axios.get(l); + expect(response.status).toBe(200); + } + + const results = await testDocLinks(page, links); + assertResults(results); + +}, timeoutMs) + + +test('Development', async() => { + const page = await newPage(); + + const links = await findNavLinks(page, 'Development'); + expect(links.length).toBeGreaterThan(0); + for (l of links) { + const response = await axios.get(l); + expect(response.status).toBe(200); + } + + const results = await testDocLinks(page, links); + assertResults(results); + +}, 180000) + + +test('Reference', async() => { + const page = await newPage() + + const links = await findNavLinks(page, 'Reference') + expect(links.length).toBeGreaterThan(0); + for (l of links) { + const response = await axios.get(l); + expect(response.status).toBe(200); + } + + const results = await testDocLinks(page, links); + assertResults(results); + +}, timeoutMs) + + +afterAll(() => { + if (browser) { + browser.close() + } +}) diff --git a/site2/website/__tests__/index.test.js b/site2/website/__tests__/index.test.js new file mode 100644 index 0000000000000000000000000000000000000000..b4254907d2e45f35cf6f9493c7e7bc69580595f8 --- /dev/null +++ b/site2/website/__tests__/index.test.js @@ -0,0 +1,18 @@ +const puppeteer = require('puppeteer') + +let browser; + +beforeAll(async () => { + browser = await puppeteer.launch({}) +}) + +afterAll(() => { + if (browser) { + browser.close() + } +}) + + +test('key feature links', async () => { + expect(3).toBe(3); +}); diff --git a/site2/website/core/Footer.js b/site2/website/core/Footer.js new file mode 100644 index 0000000000000000000000000000000000000000..622072b70b7a4a0ec619c03430f9d83181db81ad --- /dev/null +++ b/site2/website/core/Footer.js @@ -0,0 +1,187 @@ +/** + * Copyright (c) 2017-present, Facebook, Inc. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +const React = require('react'); + +/* +class Footer extends React.Component { + docUrl(doc, language) { + const baseUrl = this.props.config.baseUrl; + return baseUrl + 'docs/' + (language ? language + '/' : '') + doc; + } + + pageUrl(doc, language) { + const baseUrl = this.props.config.baseUrl; + return baseUrl + (language ? language + '/' : '') + doc; + } + + render() { + const currentYear = new Date().getFullYear(); + return ( + + ); + } +} +*/ + +class Footer extends React.Component { + docUrl(doc, language) { + const baseUrl = this.props.config.baseUrl; + return baseUrl + 'docs/' + (language ? language + '/' : '') + doc; + } + + pageUrl(doc, language) { + const baseUrl = this.props.config.baseUrl; + return baseUrl + (language ? 
language + '/' : '') + doc; + } + + render() { + const currentYear = new Date().getFullYear(); + + const contactUrl = this.pageUrl('contact') + const eventsUrl = this.pageUrl('events') + const twitterUrl = 'https://twitter.com/Apache_Pulsar' + const wikiUrl = 'https://github.com/apache/incubator-pulsar/wiki' + const issuesUrl = 'https://github.com/apache/incubator-pulsar/issues' + const resourcesUrl = this.pageUrl('resources') + const teamUrl = this.pageUrl('team') + + const communityMenuJs = ` + const community = document.querySelector("a[href='#community']").parentNode; + const communityMenu = + '
  • ' + + 'Community' + + '
    ' + + '' + + '
    ' + + '
  • '; + + community.innerHTML = communityMenu; + + const communityMenuItem = document.getElementById("community-menu"); + const communityDropDown = document.getElementById("community-dropdown"); + communityMenuItem.addEventListener("click", function(event) { + event.preventDefault(); + + if (communityDropDown.className == 'hide') { + communityDropDown.className = 'visible'; + } else { + communityDropDown.className = 'hide'; + } + }); + ` + + return ( +
    +
    {this.props.config.copyright}
    + + + +
    + ); + } +} + + +module.exports = Footer; diff --git a/site2/website/data/resources.js b/site2/website/data/resources.js new file mode 100644 index 0000000000000000000000000000000000000000..d0f04ac1d659f9b85b4f5f148252cb58535a7b92 --- /dev/null +++ b/site2/website/data/resources.js @@ -0,0 +1,54 @@ +module.exports = { + articles: [ + { + forum: 'Yahoo Engineering blog', + forum_link: 'https://yahooeng.tumblr.com/', + title: 'Open sourcing Pulsar, pub-sub messaging at scale', + link: 'https://yahooeng.tumblr.com/post/150078336821/open-sourcing-pulsar-pub-sub-messaging-at-scale' + }, + { + forum: 'Streamlio blog', + forum_link: 'https://streaml.io/blog', + title: 'Introduction to Apache Pulsar', + link: 'https://streaml.io/blog/intro-to-pulsar/' + }, + { + forum: 'Streamlio blog', + forum_link: 'https://streaml.io/blog', + title: 'Why Apache Pulsar? Part 1', + link: 'https://streaml.io/blog/why-apache-pulsar/' + }, + { + forum: 'Streamlio blog', + forum_link: 'https://streaml.io/blog', + title: 'Why Apache Pulsar? Part 2', + link: 'https://streaml.io/blog/why-apache-pulsar-part-2' + } + ], + presentations: [ + { + forum: 'Strata San Jose', + forum_link: 'https://conferences.oreilly.com/strata/strata-ca', + presenter: 'Matteo Merli (Pulsar co-creator)', + date: 'March 2018', + title: 'Effectively-once semantics in Apache Pulsar', + link: 'https://www.slideshare.net/merlimat/effectivelyonce-semantics-in-apache-pulsar' + }, + { + forum: '', + forum_link: '', + presenter: 'Matteo Merli (Pulsar co-creator)', + date: 'November 2016', + title: 'Pulsar: a distributed pub-sub platform', + link: 'https://www.slideshare.net/merlimat/pulsar-distributed-pubsub-platform' + }, + { + forum: 'Bay Area Hadoop Meetup', + forum_link: 'https://www.meetup.com/hadoop', + presenter: 'Matteo Merli (Pulsar co-creator)', + date: 'October 2016', + title: 'Pulsar: a highly scalable, low-latency pub-sub messaging system', + link: 'https://www.slideshare.net/ydn/october-2016-hug-pulsar-a-highly-scalable-low-latency-pubsub-messaging-system' + } + ] +} diff --git a/site2/website/data/team.js b/site2/website/data/team.js new file mode 100644 index 0000000000000000000000000000000000000000..6c0cc8127794a5e07d11822afb5bbde8faf49f57 --- /dev/null +++ b/site2/website/data/team.js @@ -0,0 +1,94 @@ +module.exports = { + committers: [ + { + name: 'Brad McMillen', + apacheId: 'bradtm', + org: 'Yahoo', + roles: 'Committer, PPMC', + }, + { + name: 'Hiroyuki Sakai', + apacheId: 'hrsakai', + org: 'Yahoo Japan Corporation', + roles: 'Committer, PPMC' + }, + { + name: 'Jai Asher', + apacheId: 'jai1', + org: 'Yahoo', + roles: 'Committer, PPMC' + }, + { + name: 'Joe Francis', + apacheId: 'joef', + org: 'Yahoo', + roles: 'Committer, PPMC' + }, + { + name: 'Ludwig Pummer', + apacheId: 'ludwigp', + org: 'Yahoo', + roles: 'Committer, PPMC' + }, + { + name: 'Masahiro Sakamoto', + apacheId: 'massakam', + org: 'Yahoo Japan Corporation', + roles: 'Committer, PPMC' + }, + { + name: 'Masakazu Kitajo', + apacheId: 'maskit', + org: '', + roles: 'Committer, PPMC' + }, + { + name: 'Matteo Merli', + apacheId: 'mmerli', + org: 'Streamlio', + roles: 'Committer, PPMC' + }, + { + name: 'Nozomi Kurihara', + apacheId: 'nkurihar', + org: 'Yahoo Japan Corporation', + roles: 'Committer, PPMC' + }, + { + name: 'Rajan Dhabalia', + apacheId: 'rdhabalia', + org: 'Yahoo', + roles: 'Committer, PPMC' + }, + { + name: 'Sahaya Andrews', + apacheId: 'andrews', + org: 'Yahoo', + roles: 'Committer, PPMC' + }, + { + name: 'Sebastián Schepens', + apacheId: 'sschepens', + 
org: 'MercadoLibre', + roles: 'Committer, PPMC' + }, + { + name: 'Siddharth Boobna', + apacheId: 'sboobna', + org: 'Salesforce', + roles: 'Committer, PPMC' + }, + { + name: 'Yuki Shiga', + apacheId: 'yushiga', + org: 'Yahoo Japan Corporation', + roles: 'Committer, PPMC' + } + ], + mentors: [ + {name: 'David Fisher', apacheId: 'wave'}, + {name: 'Francis Christopher Liu', apacheId: 'toffer'}, + {name: 'Jim Jagielski', apacheId: 'jim'}, + {name: 'P. Taylor Goetz', apacheId: 'ptgoetz' } + ] +} diff --git a/site2/website/jest-puppeteer.config.js b/site2/website/jest-puppeteer.config.js new file mode 100644 index 0000000000000000000000000000000000000000..75c87bf3b54d4c68c21f9776b5f53a2c6ec6117d --- /dev/null +++ b/site2/website/jest-puppeteer.config.js @@ -0,0 +1,10 @@ +module.exports = { + launch: { + headless: true, + }, + server: { + command: 'yarn start --no-watch', + port: 3000, + launchTimeout: 10000, + }, +} diff --git a/site2/website/package.json b/site2/website/package.json new file mode 100644 index 0000000000000000000000000000000000000000..b5013d40cd4eef7ab1a3f05df4cbf63333040a08 --- /dev/null +++ b/site2/website/package.json @@ -0,0 +1,24 @@ +{ + "scripts": { + "examples": "docusaurus-examples", + "start": "docusaurus-start", + "build": "docusaurus-build", + "publish-gh-pages": "docusaurus-publish", + "write-translations": "docusaurus-write-translations", + "version": "docusaurus-version", + "rename-version": "docusaurus-rename-version", + "test": "jest --detectOpenHandles" + }, + "devDependencies": { + "axios": "^0.18.0", + "docusaurus": "^1.3.2", + "jest": "^23.3.0", + "jest-puppeteer": "^3.2.1", + "puppeteer": "^1.5.0", + "remarkable-embed": "^0.4.1", + "replace-in-file": "^3.4.0" + }, + "jest": { + "preset": "jest-puppeteer" + } +} diff --git a/site2/website/pages/en/admin-rest-api.js b/site2/website/pages/en/admin-rest-api.js new file mode 100644 index 0000000000000000000000000000000000000000..cce71d45c0383519972b70d1b071f2580dc99aa4 --- /dev/null +++ b/site2/website/pages/en/admin-rest-api.js @@ -0,0 +1,44 @@ + +const React = require('react'); +const CompLibrary = require('../../core/CompLibrary.js'); + +const Container = CompLibrary.Container; +const siteConfig = require(`${process.cwd()}/siteConfig.js`); + +class AdminRestApi extends React.Component { + render() { + const swaggerUrl = `${siteConfig.baseUrl}swagger/swagger.json` + + const swagger = ` + const ui = SwaggerUIBundle({ + url: "${swaggerUrl}", + dom_id: '#swagger-ui', + presets: [ + SwaggerUIBundle.presets.apis + ], + filter: true, + //deepLinking: true, + //displayOperationId: true, + showCommonExtensions: true, + showExtensions: true, + //defaultModelRendering: "model", + defaultModelsExpandDepth: 0, + docExpansion: "list", + layout: "BaseLayout" + }) + ` + + return ( +
    + +
    + + + +
+    );
+  }
+}
+
+module.exports = AdminRestApi;
diff --git a/site2/website/pages/en/contact.js b/site2/website/pages/en/contact.js
new file mode 100644
index 0000000000000000000000000000000000000000..1bda91e19b64e116f1f4ea1a7ee773ff64c92a98
--- /dev/null
+++ b/site2/website/pages/en/contact.js
@@ -0,0 +1,118 @@
+
+const React = require('react');
+
+const CompLibrary = require('../../core/CompLibrary.js');
+const Container = CompLibrary.Container;
+const MarkdownBlock = CompLibrary.MarkdownBlock; /* Used to read markdown */
+const GridBlock = CompLibrary.GridBlock;
+
+const CWD = process.cwd();
+
+const siteConfig = require(`${CWD}/siteConfig.js`);
+
+function docUrl(doc, language) {
+  return siteConfig.baseUrl + 'docs/' + (language ? language + '/' : '') + doc;
+}
+
+class Contact extends React.Component {
+  render() {
+    let language = this.props.language || '';
+    const mailingLists = [
+      {
+        email: 'users@pulsar.incubator.apache.org',
+        desc: 'User-related discussions',
+        subscribe: 'mailto:users-subscribe@pulsar.incubator.apache.org',
+        unsubscribe: 'mailto:users-unsubscribe@pulsar.incubator.apache.org',
+        archives: 'http://mail-archives.apache.org/mod_mbox/incubator-pulsar-users/'
+      },
+      {
+        email: 'dev@pulsar.incubator.apache.org',
+        desc: 'Development-related discussions',
+        subscribe: 'mailto:dev-subscribe@pulsar.incubator.apache.org',
+        unsubscribe: 'mailto:dev-unsubscribe@pulsar.incubator.apache.org',
+        archives: 'http://mail-archives.apache.org/mod_mbox/incubator-pulsar-dev/'
+      },
+      {
+        email: 'commits@pulsar.incubator.apache.org',
+        desc: 'All commits to the Pulsar repository',
+        subscribe: 'mailto:commits-subscribe@pulsar.incubator.apache.org',
+        unsubscribe: 'mailto:commits-unsubscribe@pulsar.incubator.apache.org',
+        archives: 'http://mail-archives.apache.org/mod_mbox/incubator-pulsar-commits/'
+      }
+    ]
+
+    const supportLinks = [
+      {
+        content: `Learn more using the [documentation on this site.](${docUrl(
+          'doc1.html',
+          language
+        )})`,
+        title: 'Browse Docs',
+      },
+      {
+        content: 'Ask questions about the documentation and project',
+        title: 'Join the community',
+      },
+      {
+        content: "Find out what's new with this project",
+        title: 'Stay up to date',
+      },
+    ];
+
+    return (
    + +
    +
    +

    Contact

    +
    +
    +

+
+ There are many ways to get help from the Apache Pulsar community.
+ The mailing lists are the primary place where all Pulsar committers are present.
+ Bugs and feature requests can be discussed on the dev mailing list or reported
+ by opening an issue on GitHub.

    + +

    Mailing Lists

    + + + + + + + + + + + + {mailingLists.map( + list => ( + + + + + + + + ) + )} + +
    NameScope
    {list.email}{list.desc}SubscribeUnsubscribeArchives
    + +

    Slack

    +

There is a Pulsar Slack channel that is used for informal discussions among devs and users.

    + + The Slack instance is at [https://apache-pulsar.slack.com/](https://apache-pulsar.slack.com/) + + + You can self-register at [https://apache-pulsar.herokuapp.com/](https://apache-pulsar.herokuapp.com/) + +
    +
    +
    + ); + } +} + +module.exports = Contact; diff --git a/site2/website/pages/en/download.js b/site2/website/pages/en/download.js new file mode 100644 index 0000000000000000000000000000000000000000..248487446b53900a0214d5d07787a50320a4ee24 --- /dev/null +++ b/site2/website/pages/en/download.js @@ -0,0 +1,177 @@ +const React = require('react'); + +const CompLibrary = require('../../core/CompLibrary'); +const MarkdownBlock = CompLibrary.MarkdownBlock; /* Used to read markdown */ +const Container = CompLibrary.Container; +const GridBlock = CompLibrary.GridBlock; + +const CWD = process.cwd(); + +const siteConfig = require(`${CWD}/siteConfig.js`); +const releases = require(`${CWD}/releases.json`); + +const archiveRootUrl = siteConfig.archiveRootUrl; + + +function archiveUrl(version, type) { + return `${archiveRootUrl}/pulsar-${version}/apache-pulsar-${version}-${type}.tar.gz` +} + +class Download extends React.Component { + render() { + const latestRelease = releases[0]; + + const latestVersion = `${latestRelease}-incubating` + const latestArchiveUrl = archiveUrl(latestVersion, 'bin'); + const latestSrcArchiveUrl = archiveUrl(latestVersion, 'src') + + const releaseInfo = releases.map(r => { + const version = `${r}-incubating`; + return { + version: version, + binArchiveUrl: archiveUrl(version, 'bin'), + srcArchiveUrl: archiveUrl(version, 'src') + } + }); + + return ( +
    + +
    +
    +

    Apache Pulsar downloads

    +
    +
    +

    Current version (Stable) {latestVersion}

    + + + + + + + + + + + + + + + + + + + + +
    ReleaseLinkCrypto files
    Binary + pulsar-{latestVersion}-bin.tar.gz + + asc, + sha1, + sha512 +
    Source + pulsar-{latestVersion}-src.tar.gz + + asc, + sha1, + sha512 +
    + +

    Release Integrity

+
+
+ You must [verify](https://www.apache.org/info/verification.html) the integrity of the downloaded files.
+ We provide OpenPGP signatures for every release file. The signature should be matched against the
+ [KEYS](https://www.apache.org/dist/incubator/pulsar/KEYS) file, which contains the OpenPGP keys of
+ Pulsar's Release Managers. We also provide `SHA-1` and `SHA-512` checksums for every release file.
+ After you download the file, you should calculate a checksum for your download, and make sure it is
+ the same as ours.
+
+
+

    Release notes

+
+
[Release notes](/release-notes) for all Pulsar versions
+
+

    Getting started

    + + Once you've downloaded a Pulsar release, instructions on getting up and running with a standalone cluster + that you can run on your laptop can be found in the [Run Pulsar locally](/docs/standalone) tutorial. + +

    + If you need to connect to an existing Pulsar cluster or instance using an officially supported client, + see the client docs for these languages: +

    + + + + + + + + + + + + + + + + + + + + + + + + + +
    Client guideAPI docs
The Pulsar Java client
The Pulsar Java client
The Pulsar Go client
The Pulsar Go client
The Pulsar Python client
The Pulsar Python client
    The Pulsar C++ clientThe Pulsar C++ client
    + +

    Older releases

    + + + + + + + + + + + {releaseInfo.map( + info => + info.version !== latestVersion && ( + + + + + + + ) + )} + +
    ReleaseBinarySourceRelease notes
{info.version}
+
pulsar-{info.version}-bin.tar.gz
+
 
+ (asc, 
+ sha1, 
+ sha512)
+
+
pulsar-{info.version}-src.tar.gz
+
 
+ (asc, 
+ sha1, 
+ sha512)
+
+
Release Notes
+
    +
    +
    +
    + ); + } +} + +module.exports = Download; diff --git a/site2/website/pages/en/events.js b/site2/website/pages/en/events.js new file mode 100644 index 0000000000000000000000000000000000000000..c4ef8b08b80ad3c02ab7cc4d8b68099e5a7196a8 --- /dev/null +++ b/site2/website/pages/en/events.js @@ -0,0 +1,48 @@ + +const React = require('react'); + +const CompLibrary = require('../../core/CompLibrary.js'); +const Container = CompLibrary.Container; +const MarkdownBlock = CompLibrary.MarkdownBlock; /* Used to read markdown */ +const GridBlock = CompLibrary.GridBlock; + +const CWD = process.cwd(); + +const siteConfig = require(`${CWD}/siteConfig.js`); + +const iframeSrc = "https://calendar.google.com/calendar/embed?showTitle=0&showPrint=0&showCalendars=0&mode=AGENDA&height=300&wkst=1&hl=en&bgcolor=%23FFFFFF&src=22kknj432ap5io49lvsjaac71o%40group.calendar.google.com&color=%232952A3&ctz=America%2FLos_Angeles"; + +class Events extends React.Component { + render() { + + return ( +
    + +
    +
    +

    Events

    +
    +
    +

Calendar

    + + + +

    Groups

    + + - [Apache Pulsar Bay Area Meetup Group](https://www.meetup.com/Apache-Pulsar-Meetup-Group/) + + + - [Japan Pulsar User Group](https://japan-pulsar-user-group.connpass.com/) + +
    +
    +
    + ); + } +} + +module.exports = Events; diff --git a/site2/website/pages/en/index.js b/site2/website/pages/en/index.js new file mode 100755 index 0000000000000000000000000000000000000000..0ad7d5866eb1e43d4a61a995a1247bc14b0af70d --- /dev/null +++ b/site2/website/pages/en/index.js @@ -0,0 +1,195 @@ + +const React = require('react'); + +const CompLibrary = require('../../core/CompLibrary.js'); +const MarkdownBlock = CompLibrary.MarkdownBlock; /* Used to read markdown */ +const Container = CompLibrary.Container; +const GridBlock = CompLibrary.GridBlock; + +const CWD = process.cwd(); + +const siteConfig = require(`${CWD}/siteConfig.js`); + +function imgUrl(img) { + return siteConfig.baseUrl + 'img/' + img; +} + +function docUrl(doc, language) { + return siteConfig.baseUrl + 'docs/' + (language ? language + '/' : '') + doc; +} + +function pageUrl(page, language) { + return siteConfig.baseUrl + (language ? language + '/' : '') + page; +} + +function githubUrl() { + return siteConfig.githubUrl; +} + + +class Button extends React.Component { + render() { + return ( + + ); + } +} + +Button.defaultProps = { + target: '_self', +}; + +const SplashContainer = props => ( +
    +
    +
    {props.children}
    +
    +
    +); + +const Logo = props => ( +
    + +
    +); + +const ProjectTitle = props => ( +

    + {siteConfig.projectDescription} +

    +); + +const PromoSection = props => ( +
    +
    +
    {props.children}
    +
    +
    +); + +class HomeSplash extends React.Component { + render() { + let language = this.props.language || ''; + return ( + + +
    + + + + + +
    +
+    );
+  }
+}
+
+const Block = props => (
+
+
+);
+
+
+const features = {
+  row1: [
+    {
+      content: 'Easily deploy lightweight compute logic using developer-friendly APIs without needing to run your own stream processing engine',
+      title: `[Pulsar Functions](${docUrl('functions-overview')})`,
+    },
+    {
+      content: 'Pulsar has run in production at Yahoo scale for over 3 years, with millions of messages per second across millions of topics',
+      title: `[Proven in production](${docUrl('concepts-architecture')})`,
+    },
+    {
+      content: 'Seamlessly expand capacity to hundreds of nodes',
+      title: `[Horizontally scalable](${docUrl('concepts-architecture')})`,
+    }
+  ],
+  row2: [
+    {
+      content: 'Designed for low publish latency (< 5ms) at scale with strong durability guarantees',
+      title: `[Low latency with durability](${docUrl('concepts-architecture')})`,
+    },
+    {
+      content: 'Designed for configurable replication between data centers across multiple geographic regions',
+      title: `[Geo-replication](${docUrl('administration-geo')})`,
+    },
+    {
+      content: 'Built from the ground up as a multi-tenant system. Supports Isolation, Authentication, Authorization and Quotas',
+      title: `[Multi-tenancy](${docUrl('concepts-architecture')})`,
+    }
+  ],
+  row3: [
+    {
+      content: `Persistent message storage based on Apache BookKeeper. Provides IO-level isolation between write and read operations`,
+      title: `[Persistent storage](${docUrl('concepts-architecture')})`,
+    },
+    {
+      content: 'Flexible messaging models with high-level APIs for Java, C++, Python and Go',
+      title: `[Client libraries](${docUrl('client-libraries')})`,
+    },
+    {
+      content: 'REST Admin API for provisioning, administration, tools and monitoring. Deploy on bare metal or Kubernetes.',
+      title: `[Operability](${docUrl('admin-api-overview')})`,
+    }
+  ]
+};
+
+const KeyFeautresGrid = props => (
+
+
+
+
+);
+
+
+
+const ApacheBlock = prop => (
    +
    + +
    +

    + Apache Pulsar is an effort undergoing incubation at The Apache Software Foundation (ASF) + sponsored by the Apache Incubator PMC. Incubation is required of all newly accepted projects + until a further review indicates that the infrastructure, communications, and decision making + process have stabilized in a manner consistent with other successful ASF projects. + While incubation status is not necessarily a reflection of the completeness or stability of the code, + it does indicate that the project has yet to be fully endorsed by the ASF. + Apache Pulsar (incubating) is available under the Apache License, version 2.0. +

    +
    +
    +); + +class Index extends React.Component { + render() { + let language = this.props.language || ''; + + return ( +
    + +
    + + +
    +
    + ); + } +} + +module.exports = Index; diff --git a/site2/website/pages/en/release-notes.js b/site2/website/pages/en/release-notes.js new file mode 100644 index 0000000000000000000000000000000000000000..09643e37731494e97a23875c6505cfa8c1389ab0 --- /dev/null +++ b/site2/website/pages/en/release-notes.js @@ -0,0 +1,35 @@ +const React = require('react'); + +const CompLibrary = require('../../core/CompLibrary'); +const MarkdownBlock = CompLibrary.MarkdownBlock; /* Used to read markdown */ +const Container = CompLibrary.Container; +const GridBlock = CompLibrary.GridBlock; + +const CWD = process.cwd(); + +const siteConfig = require(`${CWD}/siteConfig.js`); + +const releaseNotes = require('fs').readFileSync(`${CWD}/release-notes.md`, 'utf8') + +class ReleaseNotes extends React.Component { + render() { + + return ( +
    + +
    +
    +

Apache Pulsar release notes

    +
    +
    + + {releaseNotes} + +
    +
    +
    + ); + } +} + +module.exports = ReleaseNotes; diff --git a/site2/website/pages/en/resources.js b/site2/website/pages/en/resources.js new file mode 100644 index 0000000000000000000000000000000000000000..621d748253916a3658abec55d44c0a20b0d54a70 --- /dev/null +++ b/site2/website/pages/en/resources.js @@ -0,0 +1,78 @@ + + +const React = require('react'); + +const CompLibrary = require('../../core/CompLibrary.js'); +const Container = CompLibrary.Container; + +const CWD = process.cwd(); + +const siteConfig = require(`${CWD}/siteConfig.js`); +const resources = require(`${CWD}/data/resources.js`) + +class Resources extends React.Component { + render() { + let language = this.props.language || ''; + + + return ( +
    + +
    +
    +

    Resources

    +
    +
    + +

    Articles

    + + + + + + + + + {resources.articles.map( + (a, i) => ( + + + + + ) + )} + +
    ForumLink
    {a.forum}{a.title}
    + +

    Presentations

    + + + + + + + + + + + {resources.presentations.map( + (p, i) => ( + + + + + + + ) + )} + +
ForumDatePresenterLink
    {p.forum}{p.date}{p.presenter}{p.title}
    + +
    +
    +
    + ); + } +} + +module.exports = Resources; diff --git a/site2/website/pages/en/team.js b/site2/website/pages/en/team.js new file mode 100644 index 0000000000000000000000000000000000000000..8f6a9b8fe3ca8a618a4ffd040aaf40dba84f2d34 --- /dev/null +++ b/site2/website/pages/en/team.js @@ -0,0 +1,96 @@ + + +const React = require('react'); + +const CompLibrary = require('../../core/CompLibrary.js'); +const Container = CompLibrary.Container; +const MarkdownBlock = CompLibrary.MarkdownBlock; /* Used to read markdown */ +const GridBlock = CompLibrary.GridBlock; + +const CWD = process.cwd(); + +const siteConfig = require(`${CWD}/siteConfig.js`); +const team = require(`${CWD}/data/team.js`) + +class Team extends React.Component { + render() { + let language = this.props.language || ''; + + + return ( +
    + +
    +
    +

Team

    +
    +
    +

    + A successful project requires many people to play many roles. + Some members write code or documentation, while others are valuable as testers, + submitting patches and suggestions. +

    +

+
+ The team is composed of Members and Contributors.
+ Members have direct access to the source of a project and actively evolve the codebase.
+ Contributors improve the project through submission of patches and
+ suggestions to the Members. The number of Contributors to the project is unbounded.
+ Get involved today. All contributions to the project are greatly appreciated.
+

    + +

    Committers

    +

+
+ The following is a list of developers with commit privileges who have directly
+ contributed to the project in one way or another.
+

    + + + + + + + + + + + {team.committers.map( + c => ( + + + + + + + ) + )} + +
    NameApache IdOrganizationRoles
    {c.name}{c.apacheId}{c.org}{c.roles}
    + +

    Mentors

    +

The following people are the mentors of this incubator project.

    + + + + + + + + + {team.mentors.map( + m => ( + + + + + ) + )} + +
    NameApache Id
    {m.name}{m.apacheId}
    +
    +
    +
    + ); + } +} + +module.exports = Team; diff --git a/site2/website/pages/en/versions.js b/site2/website/pages/en/versions.js new file mode 100644 index 0000000000000000000000000000000000000000..5aabeb1824a50cf511761c52efe8123aab3c3e51 --- /dev/null +++ b/site2/website/pages/en/versions.js @@ -0,0 +1,97 @@ + +const React = require('react'); + +const CompLibrary = require('../../core/CompLibrary'); +const Container = CompLibrary.Container; +const GridBlock = CompLibrary.GridBlock; + +const CWD = process.cwd(); + +const siteConfig = require(`${CWD}/siteConfig.js`); +//const versions = require(CWD + '/versions.json'); + +/* +class Versions extends React.Component { + render() { + const latestVersion = versions[0]; + return ( +
    + +
    +
    +

    {siteConfig.title + ' Versions'}

    +
    +

    New versions of this project are released every so often.

    +

    Current version (Stable)

    + + + + + + + + +
    {latestVersion} + Documentation + + Release Notes +
    +

    + This is the version that is configured automatically when you + first install this project. +

    +

    Pre-release versions

    + + + + + + + + +
    master + Documentation + + Release Notes +
    +

    Other text describing this section.

    +

    Past Versions

    + + + {versions.map( + version => + version !== latestVersion && ( + + + + + + ) + )} + +
    {version} + Documentation + + Release Notes +
    +

    + You can find past versions of this project{' '} + on GitHub . +

    +
    +
    +
    + ); + } +} +*/ + +class Versions extends React.Component { + render() { + return ( +

    versions

+
+    );
+  }
+}
+
+module.exports = Versions;
diff --git a/site2/website/release-notes.md b/site2/website/release-notes.md
new file mode 100644
index 0000000000000000000000000000000000000000..8b5bd36d79cbc44aaeeb1e91754844bd02a1ea6a
--- /dev/null
+++ b/site2/website/release-notes.md
@@ -0,0 +1,402 @@
+
+## Apache incubator
+
+
+### 2.0.1-incubating — 2018-06-18
+
+This release fixes issues reported for 2.0.0-rc1-incubating.
+
+ * [#1893](https://github.com/apache/incubator-pulsar/pull/1893) - Fixed issues with Python packages on PyPI
+ * [#1797](https://github.com/apache/incubator-pulsar/issues/1797) - Proxy doesn't strip the request
+   URL for admin requests correctly
+ * [#1862](https://github.com/apache/incubator-pulsar/pull/1862) - Fix REST APIs provided by Pulsar proxy
+
+The complete list of changes can be found at:
+https://github.com/apache/incubator-pulsar/milestone/14?closed=1
+
+https://github.com/apache/incubator-pulsar/releases/tag/v2.0.1-incubating
+
+### 1.22.1-incubating — 2018-06-18
+
+This is the sixth release of Apache Pulsar since entering the ASF incubator.
+
+This release addresses issues reported in the 1.22.0-incubating version.
+
+ * [#1660](https://github.com/apache/incubator-pulsar/pull/1660) Deadlock while closing non-persistent topic
+ * [#1591](https://github.com/apache/incubator-pulsar/pull/1591) Deadlock while closing non-shared consumer
+ * [#1554](https://github.com/apache/incubator-pulsar/pull/1554) Handle invalid mark-delete position at managed cursor
+ * [#1262](https://github.com/apache/incubator-pulsar/pull/1262) Broker should not start replicator for root partitioned topic
+ * [#1662](https://github.com/apache/incubator-pulsar/pull/1662) NPE when cursor failed to close empty subscription
+ * [#1370](https://github.com/apache/incubator-pulsar/pull/1370) Relocate service files for shading pulsar-client-admin module
+ * [#1265](https://github.com/apache/incubator-pulsar/pull/1265) Fixed lookup redirect logic on proxy side
+ * [#1428](https://github.com/apache/incubator-pulsar/pull/1428) Handle race condition in concurrent bundle split
+ * [#1817](https://github.com/apache/incubator-pulsar/pull/1817) Fixed memory leak when acknowledging while disconnected from broker
+ * [#1851](https://github.com/apache/incubator-pulsar/pull/1851) Fixed resource leak due to open file descriptors in SecurityUtility
+
+The complete list of changes can be found at:
+https://github.com/apache/incubator-pulsar/milestone/15?closed=1
+
+https://github.com/apache/incubator-pulsar/releases/tag/v1.22.1-incubating
+
+### 2.0.0-rc1-incubating — 2018-05-29
+
+This is the fifth release of Apache Pulsar since entering the ASF incubator and the
+first time the major release number has been increased.
+
+There are several new features and major improvements:
+
+ * [Pulsar functions](http://pulsar.apache.org/docs/latest/functions/overview/): Lightweight
+   compute framework
+ * New type-safe [Java API](http://pulsar.apache.org/docs/latest/clients/Java/) for producers/consumers
+ * [Schema registry](http://pulsar.apache.org/docs/v2.0.0-rc1-incubating/getting-started/ConceptsAndArchitecture/#Schemaregistry-ll008b) — Enforce schema on topics
+ * Topic compaction — Out-of-band compaction of messages that allows consumers to fetch a
+   snapshot with the last published message for each message key. 
+
+ * Upgraded to [Apache BookKeeper](https://bookkeeper.apache.org/) 4.7.0
+ * Performance improvements — Up to 3x throughput improvement compared to Pulsar 1.22 and
+   99.9th percentile publish latencies < 10 ms
+ * [Simplified terminology](http://pulsar.apache.org/docs/v2.0.0-rc1-incubating/getting-started/Pulsar-2.0/#Propertiesversustenants-gh1amh) and admin tools
+   - Renamed "property" to "tenant"
+   - Short topic names: `my-topic`
+   - Topics independent of cluster names: `my-tenant/my-namespace/my-topic`
+
+The complete list of changes can be found at:
+https://github.com/apache/incubator-pulsar/milestone/12?closed=1
+
+https://github.com/apache/incubator-pulsar/releases/tag/v2.0.0-rc1-incubating
+
+
+### 1.22.0-incubating — 2018-03-06
+
+This is the fourth release of Apache Pulsar since entering the ASF incubator.
+
+Major changes in this release include:
+
+#### Features
+ * [#896](https://github.com/apache/incubator-pulsar/pull/896) PIP-7 Introduce Failure-domain and Anti-affinity-namespace group
+ * [#1031](https://github.com/apache/incubator-pulsar/pull/1031) Add optional key/value metadata to producers/consumers
+ * [#1129](https://github.com/apache/incubator-pulsar/pull/1129) Added end-to-end encryption in C++ client
+ * [#1151](https://github.com/apache/incubator-pulsar/pull/1151) Added REST handler to create a subscription on a topic
+ * [#1087](https://github.com/apache/incubator-pulsar/pull/1087) Add basic authentication plugin
+ * [#1200](https://github.com/apache/incubator-pulsar/pull/1200) Add pluggable authorization mechanism
+ * [#1208](https://github.com/apache/incubator-pulsar/pull/1208) Add hostname verification at client TLS connection
+ * [#950](https://github.com/apache/incubator-pulsar/pull/950) Provided a DC/OS Universe package for Pulsar
+ * [#1046](https://github.com/apache/incubator-pulsar/pull/1046) Introduce config to skip non-recoverable data-ledger
+ * [#899](https://github.com/apache/incubator-pulsar/pull/899) Add subscription auth mode by prefix
+ * [#1135](https://github.com/apache/incubator-pulsar/pull/1135) Added infinite time retention configuration option
+
+#### Enhancements
+
+ * [#1094](https://github.com/apache/incubator-pulsar/pull/1094) Include BoringSSL native implementation for faster TLS
+ * [#1204](https://github.com/apache/incubator-pulsar/pull/1204) Reduce size of buffer used to assemble batches
+ * [#930](https://github.com/apache/incubator-pulsar/pull/930) Perform async DNS resolution
+ * [#1124](https://github.com/apache/incubator-pulsar/pull/1124) Support Pulsar proxy from C++/Python client library
+ * [#1012](https://github.com/apache/incubator-pulsar/pull/1012) Made load shedding for load manager dynamically configurable
+ * [#962](https://github.com/apache/incubator-pulsar/pull/962) Raw reader for Pulsar topics
+ * [#941](https://github.com/apache/incubator-pulsar/pull/941) Upgraded Jackson version
+ * [#1002](https://github.com/apache/incubator-pulsar/pull/1002), [#1169](https://github.com/apache/incubator-pulsar/pull/1169), [#1168](https://github.com/apache/incubator-pulsar/pull/1168) Making Pulsar proxy more secure
+ * [#1029](https://github.com/apache/incubator-pulsar/pull/1029) Fix MessageRouter hash inconsistency between C++ and Java clients
+
+#### Fixes
+
+ * [#1153](https://github.com/apache/incubator-pulsar/pull/1153) Fixed increasing partitions on a partitioned topic
+ * [#1195](https://github.com/apache/incubator-pulsar/pull/1195) Ensure the checksum is not stripped after validation in the broker
+ * 
[#1203](https://github.com/apache/incubator-pulsar/pull/1203) Use duplicates when writing from ByteBuf pair to avoid multi-threading issues
+ * [#1210](https://github.com/apache/incubator-pulsar/pull/1210) Cancel keep-alive timer task after the proxy switches to TCP proxy
+ * [#1170](https://github.com/apache/incubator-pulsar/pull/1170) Upgrade BK version: BK-4.3.1.91-yahoo (fix: stats + DoubleByteBuf)
+ * [#875](https://github.com/apache/incubator-pulsar/pull/875) Bug fixes for WebSocket proxy
+
+The complete list of changes can be found at:
+https://github.com/apache/incubator-pulsar/milestone/11?closed=1
+
+https://github.com/apache/incubator-pulsar/releases/tag/v1.22.0-incubating
+
+### 1.21.0-incubating — 2017-12-17
+
+This is the third release of Apache Pulsar since entering the ASF incubator.
+
+Major changes in this release include:
+
+ * [#689](https://github.com/apache/incubator-pulsar/pull/689) Upgrade to Netty 4.1
+ * [#846](https://github.com/apache/incubator-pulsar/pull/846) Publish the shaded pulsar-client as the default dependency
+ * [#832](https://github.com/apache/incubator-pulsar/pull/832) [#833](https://github.com/apache/incubator-pulsar/pull/833) [#849](https://github.com/apache/incubator-pulsar/pull/849) [#852](https://github.com/apache/incubator-pulsar/pull/852) Enhancements to the Kafka API wrapper to make it work with Kafka's own benchmark tools
+ * [#836](https://github.com/apache/incubator-pulsar/pull/836) Fix to C++ partitioned consumer client
+ * [#822](https://github.com/apache/incubator-pulsar/pull/822) [#826](https://github.com/apache/incubator-pulsar/pull/826) Several fixes and improvements related to the namespace bundles
+ * [#848](https://github.com/apache/incubator-pulsar/pull/848) Allow consumer to seek to message ID from within Pulsar client
+ * [#903](https://github.com/apache/incubator-pulsar/pull/903) PIP-8: Scale Pulsar beyond 1M topics
+ * [#824](https://github.com/apache/incubator-pulsar/pull/824) Enable secure replication over TLS
+ * [#923](https://github.com/apache/incubator-pulsar/pull/923) Upgrade to bk-4.3.1.83-yahoo to expose journalSyncData option
+ * [#807](https://github.com/apache/incubator-pulsar/pull/807) Prevent message duplication when active consumer is changed
+
+Complete list of changes can be found at:
+https://github.com/apache/incubator-pulsar/milestone/10?closed=1
+
+https://github.com/apache/incubator-pulsar/releases/tag/v1.21.0-incubating
+
+### 1.20.0-incubating — 2017-08-08
+
+This is the second release of Apache Pulsar since entering the ASF incubator. 
+
+Major changes in this release include:
+
+ * [#620](https://github.com/apache/incubator-pulsar/pull/620) [#717](https://github.com/apache/incubator-pulsar/pull/717) [#718](https://github.com/apache/incubator-pulsar/pull/718) Reader API support for C++, Python & WebSocket proxy
+ * [#634](https://github.com/apache/incubator-pulsar/pull/634) Added [Message dispatch throttling](https://github.com/apache/incubator-pulsar/wiki/PIP-3:-Message-dispatch-throttling)
+ * [#731](https://github.com/apache/incubator-pulsar/pull/731) Added [End to End Encryption](https://github.com/apache/incubator-pulsar/wiki/PIP-4:-Pulsar-End-to-End-Encryption)
+ * [#732](https://github.com/apache/incubator-pulsar/pull/732) Support for [Event Time](https://github.com/apache/incubator-pulsar/wiki/PIP-5:-Event-time) for messages
+ * [#751](https://github.com/apache/incubator-pulsar/pull/751) Guaranteed [Deduplication of Messages](https://github.com/apache/incubator-pulsar/wiki/PIP-6:-Guaranteed-Message-Deduplication)
+ * [#761](https://github.com/apache/incubator-pulsar/pull/761) Kafka API wrapper for Pulsar client library
+
+Complete list of changes can be found at:
+https://github.com/apache/incubator-pulsar/milestone/9?closed=1
+
+https://github.com/apache/incubator-pulsar/releases/tag/v1.20.0-incubating
+
+### 1.19.0-incubating — 2017-08-08
+
+This is the first release of Apache Pulsar since entering the ASF incubator.
+
+Major changes included in this release are:
+
+ * [#524](https://github.com/apache/incubator-pulsar/pull/524) Moved APIs from `com.yahoo.pulsar` to `org.apache.pulsar`
+ * [#548](https://github.com/apache/incubator-pulsar/pull/548) Added stateless [Pulsar proxy](https://github.com/apache/incubator-pulsar/wiki/PIP-1:-Pulsar-Proxy)
+ * [#538](https://github.com/apache/incubator-pulsar/pull/538) Support for [non-persistent topics](https://github.com/apache/incubator-pulsar/wiki/PIP-2:-Non-Persistent-topic)
+ * [#587](https://github.com/apache/incubator-pulsar/pull/587) Upgraded RocksDB to comply with ASF policy
+ * [#507](https://github.com/apache/incubator-pulsar/pull/507) Instrumentation of ZooKeeper client to expose metrics
+ * Various fixes for TLS auth in WebSocket proxy
+
+Complete list of changes can be found at:
+https://github.com/apache/incubator-pulsar/milestone/8?closed=1
+
+https://github.com/apache/incubator-pulsar/releases/tag/v1.19.0-incubating
+
+## Pre-Apache
+
+### 1.18 — 2017-06-17
+
+Main changes:
+ * [#325](https://github.com/apache/incubator-pulsar/pull/325) Add Modular load manager documentation
+ * [#329](https://github.com/apache/incubator-pulsar/pull/329) Add API to get list of partitioned topics
+ * [#296](https://github.com/apache/incubator-pulsar/pull/296) Added Spark Streaming custom receiver for Pulsar
+ * [#317](https://github.com/apache/incubator-pulsar/pull/317) HTTP lookups for C++ client lib
+ * [#332](https://github.com/apache/incubator-pulsar/pull/332) Fix: Modular load manager bug fixes
+ * [#352](https://github.com/apache/incubator-pulsar/pull/352) Fix: Delete local-policies and invalidate cache when namespace is deleted
+ * [#356](https://github.com/apache/incubator-pulsar/pull/356) Fix: WebSocket TLS connection bug
+ * [#363](https://github.com/apache/incubator-pulsar/pull/363) Use binary protocol lookup for connection between WebSocket proxy and broker
+ * [#375](https://github.com/apache/incubator-pulsar/pull/375) Fix: Bug fixes on deadlock on topic loading failure
+ * [#376](https://github.com/apache/incubator-pulsar/pull/376) Fix: Avoid incrementing unack-msg 
count for non-shared sub and not show it on stats + * [#329](https://github.com/apache/incubator-pulsar/pull/329) Fix: Handle zkCache failures + * [#387](https://github.com/apache/incubator-pulsar/pull/387) Pass client library version to broker and show on stats + * [#345](https://github.com/apache/incubator-pulsar/pull/345) Add load shedding strategy + * [#393](https://github.com/apache/incubator-pulsar/pull/393) Change default mark-delete rate limit from 10s to 1s + * [#392](https://github.com/apache/incubator-pulsar/pull/392) Upgrade to netty-4.0.46 + * [#366](https://github.com/apache/incubator-pulsar/pull/366) NonDurable cursor for managed ledger + * [#371](https://github.com/apache/incubator-pulsar/pull/371) Introduce topic reader in client API + * [#341](https://github.com/apache/incubator-pulsar/pull/341) Add stats and monitoring for websocket proxy + * [#299](https://github.com/apache/incubator-pulsar/pull/299) Add api to increase partitions of existing non-global partitioned-topic + * [#294](https://github.com/apache/incubator-pulsar/pull/294) Add endpoint to fetch stats for Prometheus + * [#440](https://github.com/apache/incubator-pulsar/pull/440) Enable PulsarAdmin to trust multiple certificates + * [#442](https://github.com/apache/incubator-pulsar/pull/442) Fix: Remove broker weights for ModularLoadManager + * [#446](https://github.com/apache/incubator-pulsar/pull/446) Fix: Recover cursor with correct readPosition and replay unackedMessages + * [#441](https://github.com/apache/incubator-pulsar/pull/441) Set Block If queue full to false by default + * [#447](https://github.com/apache/incubator-pulsar/pull/447) Fix: DoubleByteBuf to send large size messages in TLS mode + * [#443](https://github.com/apache/incubator-pulsar/pull/443) Add topic termination option + * [#436](https://github.com/apache/incubator-pulsar/pull/436) Added ZooKeeper instrumentation for enhanced stats + * [#448](https://github.com/apache/incubator-pulsar/pull/448) WebSocket proxy should not make a consumer/producer when authorization is failed + * [#443](https://github.com/apache/incubator-pulsar/pull/443) Add Docker images definition and instruction to deploy on Kubernetes + * [#474](https://github.com/apache/incubator-pulsar/pull/474) Fix: message rate out with batches to count messages/s + * [#482](https://github.com/apache/incubator-pulsar/pull/482) Allow client(producer/consumer) to check topic stats + * [#468](https://github.com/apache/incubator-pulsar/pull/468) Pulsar Python client library + * [#386](https://github.com/apache/incubator-pulsar/pull/386) Increment bookkeeper version to 4.3.1.69-yahoo + +Full list of changes: https://github.com/yahoo/pulsar/milestone/7?closed=1 + +https://github.com/apache/incubator-pulsar/releases/tag/v1.18 + +### 1.17.5 — 2017-05-02 + + * [#343](https://github.com/apache/incubator-pulsar/pull/343) Fix ModularLoadManager to select broker from current available-broker list + * [#384](https://github.com/apache/incubator-pulsar/pull/384) Fix Send replay entries read callback from background thread, to avoid recursive stack calls + * [#390](https://github.com/apache/incubator-pulsar/pull/390) Fix Shaded AsyncHttpClient in pulsar client + * [#374](https://github.com/apache/incubator-pulsar/pull/374) Fix Remove Exceptionally Completed Topic Futures + +https://github.com/apache/incubator-pulsar/releases/tag/v1.17.5 + +### 1.17.4 — 2017-04-25 + + * [#362](https://github.com/apache/incubator-pulsar/pull/362) Fix add timeout on blocking ZookeeperCache get call + * 
[#375](https://github.com/apache/incubator-pulsar/pull/375) Fix possible deadlock on topic loading if broker fails to get MLConfiguration from zk
+ * [#377](https://github.com/apache/incubator-pulsar/pull/377) Fix zkCache error handling and zk-callback processing on separate dedicated thread
+
+https://github.com/apache/incubator-pulsar/releases/tag/v1.17.4
+
+### 1.17.3 — 2017-04-20
+
+ * [#367](https://github.com/apache/incubator-pulsar/pull/367) Fix dispatcher correctly finds available consumer from list of shared-subscription consumers
+
+https://github.com/apache/incubator-pulsar/releases/tag/v1.17.3
+
+### 1.17.2 — 2017-04-06
+
+ * [#327](https://github.com/apache/incubator-pulsar/pull/327) Create znode for dynamic configuration if not present
+ * [#336](https://github.com/apache/incubator-pulsar/pull/336) Fix prevent creation of topic when bundle is disabled
+ * [#338](https://github.com/apache/incubator-pulsar/pull/338) Fix deserialize load report based on load-manager
+
+https://github.com/apache/incubator-pulsar/releases/tag/v1.17.2
+
+### 1.17.1 — 2017-03-30
+
+ * [#326](https://github.com/apache/incubator-pulsar/pull/326) Fix memory leak while duplicating entry data from existing entry
+
+https://github.com/apache/incubator-pulsar/releases/tag/v1.17.1
+
+### 1.17 — 2017-03-30
+
+Main changes:
+
+ * [#188](https://github.com/apache/incubator-pulsar/pull/188) Pulsar Dashboard
+ * [#276](https://github.com/apache/incubator-pulsar/pull/276) Broker persist individually deleted messages
+ * [#282](https://github.com/apache/incubator-pulsar/pull/282) Support binary format to persist managed-ledger info in ZK
+ * [#292](https://github.com/apache/incubator-pulsar/pull/292) Added REST and CLI tool to expose ManagedLedger metadata
+ * [#285](https://github.com/apache/incubator-pulsar/pull/285) Add documentation in Japanese
+ * [#178](https://github.com/apache/incubator-pulsar/pull/178) Add Athenz authentication plugin
+ * [#186](https://github.com/apache/incubator-pulsar/pull/186) Update Broker service configuration dynamically
+ * [#215](https://github.com/apache/incubator-pulsar/pull/215) Fix Broker disconnects unsupported batch-consumer on batch-message topic
+ * [#165](https://github.com/apache/incubator-pulsar/pull/165) Message dispatching on consumer priority-level
+ * [#303](https://github.com/apache/incubator-pulsar/pull/303) Introduce new load manager implementation
+ * [#306](https://github.com/apache/incubator-pulsar/pull/306) Add topic loading throttling at broker
+ * [#302](https://github.com/apache/incubator-pulsar/pull/302) Update BK version to 4.3.1.60-yahoo to include: 64bit ledger-ids, fix: memory leak on read-only bookie and datasketches concurrency issue
+ * [#216](https://github.com/apache/incubator-pulsar/pull/216) Binary proto API to get consumer stats
+ * [#225](https://github.com/apache/incubator-pulsar/pull/225) Server lookup throttling
+ * [#182](https://github.com/apache/incubator-pulsar/pull/182) Client lookup request throttling and server-error handling
+ * [#265](https://github.com/apache/incubator-pulsar/pull/265) Fix client handling on HTTP server error
+ * [#204](https://github.com/apache/incubator-pulsar/pull/204) Fix discovery service redirection
+ * [#311](https://github.com/apache/incubator-pulsar/pull/311) Fix Netty package conflict in binary distribution
+ * [#221](https://github.com/apache/incubator-pulsar/pull/221) Fixed race condition on client reconnection logic
+ * [#239](https://github.com/apache/incubator-pulsar/pull/239) Fix replicator 
handling on closed cursor + * [#318](https://github.com/apache/incubator-pulsar/pull/318) GC improvements: Recyclable entry and reduce collection on stats generation + +Full list of changes: https://github.com/apache/incubator-pulsar/milestone/3?closed=1 + +https://github.com/apache/incubator-pulsar/releases/tag/v1.17 + +### 1.16.5 — 2017-03-10 + + * [#311](https://github.com/apache/incubator-pulsar/pull/311) Exclude netty individual jars from binary distribution. This issue was causing binary distribution to have conflicting netty dependencies. + +https://github.com/apache/incubator-pulsar/releases/tag/v1.16.5 + +### 1.16.4 — 2017-03-10 + + * [#265](https://github.com/apache/incubator-pulsar/pull/265) Fix client closes http-connection on internal-server error + * [#283](https://github.com/apache/incubator-pulsar/pull/283) Fix recycle keep alive command-object properly + * [#284](https://github.com/apache/incubator-pulsar/pull/284) Reduce usage of collections in managed-ledger metrics-generation to reduce GC impact + +https://github.com/apache/incubator-pulsar/releases/tag/v1.16.4 + +### 1.16.3 — 2017-03-01 + + * [#275](https://github.com/apache/incubator-pulsar/pull/275) Fix for Infinite loop in PersistentReplicator.startProducer() + +https://github.com/apache/incubator-pulsar/releases/tag/v1.16.3 + +### 1.16.2 — 2017-02-24 + + * [#250](https://github.com/apache/incubator-pulsar/pull/250) : Disconnect consumers without closing dispatcher on cursor-reset + +https://github.com/apache/incubator-pulsar/releases/tag/v1.16.2 + +### 1.16.1 — 2017-02-24 + + * [#221](https://github.com/apache/incubator-pulsar/pull/221) Fixed race condition while creating client connection + * [#223](https://github.com/apache/incubator-pulsar/pull/223) Fixed broker's direct memory usage count + * [#220](https://github.com/apache/incubator-pulsar/pull/220) Fixed stuck replicator producer on backlog quota exception + * [#239](https://github.com/apache/incubator-pulsar/pull/239) Fixed replicator stop reading on already closed cursor + +https://github.com/apache/incubator-pulsar/releases/tag/v1.16.1 + +### 1.16 — 2017-02-02 + +Main changes: + * [#76](https://github.com/apache/incubator-pulsar/pull/76) Async Zookeeper cache implementation + * [#105](https://github.com/apache/incubator-pulsar/pull/105) Support topic lookup using pulsar binary protocol + * [#164](https://github.com/apache/incubator-pulsar/pull/164) Fixed handling failure of unloading namespace bundle + * [#166](https://github.com/apache/incubator-pulsar/pull/166) Support websocket proxy deployment without passing globalZK + * [#161](https://github.com/apache/incubator-pulsar/pull/161) Fixed avoiding creation of duplicate replicator + * [#160](https://github.com/apache/incubator-pulsar/pull/160) Add support uri encoding on broker admin rest api + * [#143](https://github.com/apache/incubator-pulsar/pull/143) Include DataSketches metrics provider for bookie stats + * [#127](https://github.com/apache/incubator-pulsar/pull/127) Updated BK-4.3.1.45/47-yahoo to include bookie/bookkeeper-client bug-fixes and DataSketch metrics provider + * [#124](https://github.com/apache/incubator-pulsar/pull/124) Consumer-stats: Add blockedConsumer flag + * [#95](https://github.com/apache/incubator-pulsar/pull/95) Consumer-stats: Add message redelivery rate + * [#123](https://github.com/apache/incubator-pulsar/pull/123) Fixed Batch message replication + * [#106](https://github.com/apache/incubator-pulsar/pull/106) Fixed Partitioned consumer should avoid blocking call to fill 
shared queue + * [#139](https://github.com/apache/incubator-pulsar/pull/139) Support online consumer cursor reset + * [#187](https://github.com/apache/incubator-pulsar/pull/187) Support custom advertised address in pulsar standalone + +Full list of changes: https://github.com/yahoo/pulsar/milestone/2?closed=1 + +https://github.com/apache/incubator-pulsar/releases/tag/v1.16 + +### 1.15.7 — 2017-01-25 + + * [#174](https://github.com/apache/incubator-pulsar/pull/174) Handling bundle unloading failure + +https://github.com/apache/incubator-pulsar/releases/tag/v1.15.7 + +### 1.15.6 — 2017-01-20 + + * [#171](https://github.com/apache/incubator-pulsar/pull/171) Fix: Consumer redelivery should not wipeout availablePermits + +https://github.com/apache/incubator-pulsar/releases/tag/v1.15.6 + +### 1.15.5 — 2017-01-03 + + * [#159](https://github.com/apache/incubator-pulsar/pull/159) Fix: Replicator-cleanup while closing replicator at broker. + * [#160](https://github.com/apache/incubator-pulsar/pull/160) Fix: Http lookup for topic with special character + +https://github.com/apache/incubator-pulsar/releases/tag/v1.15.5 + +### 1.15.4 — 2016-12-14 + + * [#146](https://github.com/apache/incubator-pulsar/pull/146) Fix: Partitioned consumer can consume messages with receiverQueueSize 1. + +https://github.com/apache/incubator-pulsar/releases/tag/v1.15.4 + +### 1.15.3 — 2016-12-13 + + * [#145](https://github.com/apache/incubator-pulsar/pull/145) Fixed issue Partitioned-consumer aggregate messages without blocking internal listener thread + +https://github.com/apache/incubator-pulsar/releases/tag/v1.15.3 + +### 1.15.2 — 2016-11-03 + + * [#102](https://github.com/apache/incubator-pulsar/pull/102) Fixed issue with message dispatching while message-replay at broker + +https://github.com/apache/incubator-pulsar/releases/tag/v1.15.2 + +### 1.15.1 — 2016-10-27 + + * [#89](https://github.com/apache/incubator-pulsar/pull/89) Fixed issue with replication in a mixed + environment with 1.14 and 1.15 brokers + +https://github.com/apache/incubator-pulsar/releases/tag/v1.15.1 + +### 1.15 — 2016-10-18 + +- [#39](https://github.com/apache/incubator-pulsar/pull/39) Updated BookKeeper version to 4.3.1.41-yahoo to include bookie storage improvements +- [#17](https://github.com/apache/incubator-pulsar/pull/17) Fixed memory leak in stats generation buffer rollover +- [#27](https://github.com/apache/incubator-pulsar/pull/27) Fixed issues with discovery service component when HTTPS is enabled +- [#43](https://github.com/apache/incubator-pulsar/pull/43) Add end-to-end crc32c checksum verification on message header and payload, rather than just payload. Support for intel hardware instructions to speed up computation. +- [#26](https://github.com/apache/incubator-pulsar/pull/26) Added ability to configure the address that the broker uses to advertise itself. Needed in cases where the public hostname/ip is different than the machine interface ip (eg: in AWS EC2 instances). +- [#38](https://github.com/apache/incubator-pulsar/pull/38) Prevent message-replay of already acknowledged messages +- [#51](https://github.com/apache/incubator-pulsar/pull/51) Per message unacknowledged redelivery. When ack-timeout is configured, only request redelivery of messages that effectively have the timeout expired, instead of all the messages dispatched to the consumer. 
+- [#48](https://github.com/apache/incubator-pulsar/pull/48) Added an unacknowledged-messages threshold to stop delivery to consumers that are not acknowledging messages
+- [#59](https://github.com/apache/incubator-pulsar/pull/59) Added admin method to trigger a one-time message expiration for a given subscription (independent of the TTL configured at the namespace level)
+
+Full list of changes: https://github.com/apache/incubator-pulsar/milestone/1?closed=1
+
+https://github.com/apache/incubator-pulsar/releases/tag/v1.15
+
+### 1.14 — 2016-08-31
+
+First public release of Pulsar
+
+https://github.com/apache/incubator-pulsar/releases/tag/v1.14
diff --git a/site2/website/releases.json b/site2/website/releases.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c69733c5cfb79121c6f224e275a54d9f43023ab
--- /dev/null
+++ b/site2/website/releases.json
@@ -0,0 +1,8 @@
+[
+  "2.0.1",
+  "1.22.1",
+  "1.22.0",
+  "1.21.0",
+  "1.20.0",
+  "1.19.0"
+]
\ No newline at end of file
diff --git a/site2/website/scripts/replace.js b/site2/website/scripts/replace.js
new file mode 100644
index 0000000000000000000000000000000000000000..0ec9264e3d6573bdc3b351f2a7ece417761a6128
--- /dev/null
+++ b/site2/website/scripts/replace.js
@@ -0,0 +1,100 @@
+const replace = require('replace-in-file');
+
+const fs = require('fs')
+
+const CWD = process.cwd()
+const siteConfig = require(`${CWD}/siteConfig.js`);
+const docsDir = `${CWD}/build/${siteConfig.projectName}/docs`
+
+
+function getVersions() {
+  try {
+    return JSON.parse(fs.readFileSync(`${CWD}/versions.json`, 'utf8'));
+  } catch (error) {
+    //console.error(error)
+    console.error('no versions found, defaulting to 2.1.0')
+  }
+  return ['2.1.0']
+}
+
+function downloadPageUrl() {
+  return `${siteConfig.baseUrl}download`
+}
+
+function pulsarRepoUrl() {
+  return siteConfig.githubUrl;
+}
+
+function binaryReleaseUrl(version) {
+  return `http://www.apache.org/dyn/closer.cgi/incubator/pulsar/pulsar-${version}/apache-pulsar-${version}-bin.tar.gz`
+}
+
+
+function doReplace(options) {
+  replace(options)
+    .then(changes => {
+      if (options.dry) {
+        console.log('Files that would be modified:');
+        console.log(changes.join('\n'))
+      }
+    })
+    .catch(error => {
+      console.error('Error occurred:', error);
+    });
+}
+
+
+const versions = getVersions();
+
+const latestVersion = versions[0];
+
+// Placeholder tokens in the generated HTML that get replaced with real values
+const from = [
+  /pulsar:version/g,
+  /pulsar:binary_release_url/g,
+  /pulsar:download_page_url/g,
+  /pulsar:repo_url/g
+];
+
+
+const options = {
+  files: [
+    `${docsDir}/*.html`,
+    `${docsDir}/**/*.html`
+  ],
+  ignore: versions.map(v => `${docsDir}/${v}/**/*`), // TODO add next and assets
+  from: from,
+  to: [
+    `${latestVersion}-incubating`,
+    binaryReleaseUrl(`${latestVersion}-incubating`),
+    downloadPageUrl(),
+    pulsarRepoUrl()
+  ],
+  dry: false
+};
+
+doReplace(options);
+
+// TODO activate and test when first version of docs are cut
+// Replace placeholders in each versioned copy of the docs (dry run for now)
+for (const v of versions) {
+  if (v === latestVersion) {
+    continue
+  }
+  const opts = {
+    files: [
+      `${docsDir}/${v}/*.html`,
+      `${docsDir}/${v}/**/*.html`
+    ],
+    from: from,
+    to: [
+      `${v}-incubating`,
+      binaryReleaseUrl(`${v}-incubating`),
+      downloadPageUrl(),
+      pulsarRepoUrl()
+    ],
+    dry: true
+  };
+  doReplace(opts);
+}
+
diff --git a/site2/website/scripts/test-server.js b/site2/website/scripts/test-server.js
new file mode 100644
index 0000000000000000000000000000000000000000..2949c1101693049ff6a3194f4192e20abde33dfd
--- /dev/null
+++ b/site2/website/scripts/test-server.js
@@ -0,0 +1,22 @@
+
+const CWD = process.cwd()
+const siteConfig = require(`${CWD}/siteConfig.js`);
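+// Serves the generated site from ./build/<projectName> on http://localhost:3000 for local testing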
+
+var finalhandler = require('finalhandler')
+var http = require('http')
+var serveStatic = require('serve-static')
+
+var options = {
+  index: ['index.html', 'index.htm'],
+  extensions: ['html']
+}
+var serve = serveStatic(`./build/${siteConfig.projectName}`, options)
+
+// Create server
+var server = http.createServer(function onRequest (req, res) {
+  serve(req, res, finalhandler(req, res))
+})
+
+// Listen
+server.listen(3000)
diff --git a/site2/website/sidebars.json b/site2/website/sidebars.json
new file mode 100644
index 0000000000000000000000000000000000000000..64588929afe14453baa9a9cda89cc654e40e4e37
--- /dev/null
+++ b/site2/website/sidebars.json
@@ -0,0 +1,97 @@
+{
+  "docs": {
+    "Getting started": [
+      "pulsar-2.0",
+      "standalone",
+      "standalone-docker",
+      "client-libraries",
+      "concepts-architecture"
+    ],
+    "Pulsar Functions": [
+      "functions-overview",
+      "functions-quickstart",
+      "functions-api",
+      "functions-deploying",
+      "functions-guarantees",
+      "functions-metrics"
+    ],
+    "Pulsar IO": [
+      "io-overview",
+      "io-quickstart"
+    ],
+    "Deployment": [
+      "deploy-aws",
+      "deploy-kubernetes",
+      "deploy-bare-metal",
+      "deploy-bare-metal-multi-cluster",
+      "deploy-dcos",
+      "deploy-monitoring"
+    ],
+    "Pulsar administration": [
+      "administration-zk-bk",
+      "administration-geo",
+      "administration-auth",
+      "administration-dashboard",
+      "administration-stats",
+      "administration-load-distribution",
+      "administration-proxy"
+    ],
+    "Security": [
+      "security-overview",
+      "security-athenz",
+      "security-authorization",
+      "security-tls",
+      "security-encryption",
+      "security-extending"
+    ],
+    "Client libraries": [
+      "client-libraries-java",
+      "client-libraries-go",
+      "client-libraries-python",
+      "client-libraries-cpp",
+      "client-libraries-websocket"
+    ],
+    "Admin API": [
+      "admin-api-overview",
+      "admin-api-clusters",
+      "admin-api-tenants",
+      "admin-api-brokers",
+      "admin-api-namespaces",
+      "admin-api-permissions",
+      "admin-api-non-persistent-topics",
+      "admin-api-partitioned-topics"
+    ],
+    "Adaptors": [
+      "adaptors-kafka",
+      "adaptors-spark",
+      "adaptors-storm"
+    ],
+    "Cookbooks": [
+      "cookbooks-tiered-storage",
+      "cookbooks-compaction",
+      "cookbooks-deduplication",
+      "cookbooks-non-persistent",
+      "cookbooks-partitioned",
+      "cookbooks-retention-expiry",
+      "cookbooks-encryption",
+      "cookbooks-message-queue"
+    ],
+    "Development": [
+      "develop-tools",
+      "develop-binary-protocol",
+      "develop-schema",
+      "develop-load-manager",
+      "develop-cpp"
+    ],
+    "Reference": [
+      "reference-rest-api",
+      "reference-cli-tools",
+      "pulsar-admin",
+      "reference-configuration",
+      "reference-auth"
+    ]
+  },
+  "docs-other": {
+    "First Category": ["doc4", "doc5"]
+  }
+}
diff --git a/site2/website/siteConfig.js b/site2/website/siteConfig.js
new file mode 100644
index 0000000000000000000000000000000000000000..a996ebfb16fdd69202ca259792eedf83914d6abf
--- /dev/null
+++ b/site2/website/siteConfig.js
@@ -0,0 +1,135 @@
+
+const {Plugin: Embed} = require('remarkable-embed');
+
+// Our custom remarkable plugin factory.
+const createVariableInjectionPlugin = variables => {
+  // `let` binding used to initialize the `Embed` plugin only once for efficiency.
+  // See `if` statement below.
+  let initializedPlugin;
+
+  const embed = new Embed();
+  embed.register({
+    // Call the render method to process the corresponding variable with
+    // the passed Remarkable instance.
+    // -> the Markdown markup in the variable will be converted to HTML.
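+    // e.g. a {@inject: key} token in a Markdown source is rendered from variables[key]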
+ inject: (key) => { + return initializedPlugin.render(variables[key]) + } + }); + + return (md, options) => { + if (!initializedPlugin) { + initializedPlugin = { + render: md.render.bind(md), + hook: embed.hook(md, options) + }; + } + + return initializedPlugin.hook; + }; +}; + + +const url = 'https://pulsar.incubator.apache.org'; +const githubUrl = 'https://github.com/apache/incubator-pulsar'; +const baseUrl = '/staging/'; + +const siteVariables = { +}; + + +const siteConfig = { + title: 'Apache Pulsar' /* title for your website */, + tagline: '', + url: url /* your website url */, + baseUrl: baseUrl /* base url for your project */, + // For github.io type URLs, you would set the url and baseUrl like: + // url: 'https://facebook.github.io', + // baseUrl: '/test-site/', + + editUrl: `${githubUrl}/edit/master/site2/docs/`, + + // Used for publishing and more + projectName: 'pulsar', + //organizationName: '', + // For top-level user or org sites, the organization is still the same. + // e.g., for the https://JoelMarcey.github.io site, it would be set like... + // organizationName: 'JoelMarcey' + + // For no header links in the top nav bar -> headerLinks: [], + headerLinks: [ + {doc: 'standalone', label: 'Documentation'}, + {page: 'download', label: 'Download'}, + {doc: 'client-libraries', label: 'Client libraries'}, + {href: '#community', label: 'Community'}, + {href: '#apache', label: 'Apache'}, + { search: true }, + // Determines language drop down position among links + // { languages: true } + ], + + // If you have users set above, you add it here: + users: [], + + /* path to images for header/footer */ + headerIcon: 'img/pulsar.svg', + footerIcon: 'img/pulsar.svg', + favicon: 'img/pulsar.ico', + + /* colors for website */ + colors: { + primaryColor: '#188fff', + secondaryColor: '#205C3B', + }, + // This copyright info is used in /core/Footer.js and blog rss/atom feeds. + copyright: + 'Copyright © ' + + new Date().getFullYear() + + ' The Apache Software Foundation. All Rights Reserved.' + + ' Apache, Apache Pulsar and the Apache feather logo are trademarks of The Apache Software Foundation.', + + highlight: { + // Highlight.js theme to use for syntax highlighting in code blocks + theme: 'atom-one-dark', + }, + + // Add custom scripts here that would be placed in