提交 1aa363ce 编写于 作者: F fjy

new quickstart

上级 9fde5924
......@@ -58,7 +58,7 @@
<classpath/>
<argument>-Ddruid.extensions.loadList=[]</argument>
<argument>-Ddruid.extensions.directory=${project.build.directory}/extensions</argument>
<argument>-Ddruid.extensions.hadoopDependenciesDir=${project.build.directory}/hadoop_dependencies</argument>
<argument>-Ddruid.extensions.hadoopDependenciesDir=${project.build.directory}/hadoop-dependencies</argument>
<argument>io.druid.cli.Main</argument>
<argument>tools</argument>
<argument>pull-deps</argument>
......
......@@ -34,85 +34,152 @@
</excludes>
<outputDirectory>extensions</outputDirectory>
</fileSet>
<fileSet>
<directory>${project.build.directory}/hadoop_dependencies</directory>
<directory>${project.build.directory}/hadoop-dependencies</directory>
<includes>
<include>*/*/*</include>
</includes>
<outputDirectory>hadoop_dependencies</outputDirectory>
<outputDirectory>hadoop-dependencies</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/config</directory>
<directory>../examples/quickstart/</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>config</outputDirectory>
<outputDirectory>quickstart</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/config/_common</directory>
<directory>../examples/conf-quickstart</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>config/_common</outputDirectory>
<outputDirectory>conf-quickstart</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/config/broker</directory>
<directory>../examples/conf-quickstart/druid</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>config/broker</outputDirectory>
<outputDirectory>conf-quickstart/druid</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/config/coordinator</directory>
<directory>../examples/conf-quickstart/druid/_common</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>config/coordinator</outputDirectory>
<outputDirectory>conf-quickstart/druid/_common/</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/config/realtime</directory>
<directory>../examples/conf-quickstart/druid/broker</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>config/realtime</outputDirectory>
<outputDirectory>conf-quickstart/druid/broker</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/config/historical</directory>
<directory>../examples/conf-quickstart/druid/coordinator</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>config/historical</outputDirectory>
<outputDirectory>conf-quickstart/druid/coordinator</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/config/overlord</directory>
<directory>../examples/conf-quickstart/druid/historical</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>config/overlord</outputDirectory>
<outputDirectory>conf-quickstart/druid/historical</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/bin</directory>
<directory>../examples/conf-quickstart/druid/overlord</directory>
<includes>
<include>*sh</include>
<include>*</include>
</includes>
<fileMode>744</fileMode>
<outputDirectory>/</outputDirectory>
<outputDirectory>conf-quickstart/druid/overlord</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf-quickstart/druid/middleManager</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf-quickstart/druid/middleManager</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf-quickstart/tranquility</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf-quickstart/tranquility</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf/druid/_common</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf/druid/_common</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf/druid/broker</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf/druid/broker</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf/druid/coordinator</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf/druid/coordinator</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf/druid/historical</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf/druid/historical</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/bin/examples</directory>
<directory>../examples/conf/druid/overlord</directory>
<includes>
<include>**</include>
<include>*</include>
</includes>
<outputDirectory>conf/druid/overlord</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf/druid/middleManager</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf/druid/middleManager</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf/tranquility</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>examples</outputDirectory>
<outputDirectory>conf/tranquility</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/bin/examples/twitter</directory>
<directory>../examples/bin</directory>
<includes>
<include>*sh</include>
<include>*</include>
</includes>
<fileMode>744</fileMode>
<outputDirectory>examples/twitter</outputDirectory>
<outputDirectory>bin</outputDirectory>
</fileSet>
<fileSet>
<directory>../</directory>
<includes>
......
......@@ -22,7 +22,7 @@ Many of Druid's external dependencies can be plugged in as modules. Extensions c
|Property|Description|Default|
|--------|-----------|-------|
|`druid.extensions.directory`|The root extension directory where user can put extensions related files. Druid will load extensions stored under this directory.|`extensions` (This is a relative path to Druid's working directory)|
|`druid.extensions.hadoopDependenciesDir`|The root hadoop dependencies directory where user can put hadoop related dependencies files. Druid will load the dependencies based on the hadoop coordinate specified in the hadoop index task.|`hadoop_dependencies` (This is a relative path to Druid's working directory|
|`druid.extensions.hadoopDependenciesDir`|The root hadoop dependencies directory where user can put hadoop related dependencies files. Druid will load the dependencies based on the hadoop coordinate specified in the hadoop index task.|`hadoop-dependencies` (This is a relative path to Druid's working directory)|
|`druid.extensions.loadList`|A JSON array of extensions to load from extension directories by Druid. If it is not specified, its value will be `null` and Druid will load all the extensions under `druid.extensions.directory`. If its value is empty list `[]`, then no extensions will be loaded at all.|null|
|`druid.extensions.searchCurrentClassloader`|This is a boolean flag that determines if Druid will search the main classloader for extensions. It defaults to true but can be turned off if you have reason to not automatically add all modules on the classpath.|true|
......
......@@ -4,21 +4,40 @@ layout: doc_page
Production Cluster Configuration
================================
__This configuration is an example of what a production cluster could look like. Many other hardware combinations are possible! Cheaper hardware is absolutely possible.__
```note-info
This configuration is an example of what a production cluster could look like. Many other hardware combinations are
possible! Cheaper hardware is absolutely possible.
```
This production Druid cluster assumes that metadata storage and Zookeeper are already set up. The deep storage that is used for examples is S3 and memcached is used as a distributed cache.
This production Druid cluster assumes that metadata storage and Zookeeper are already set up. The deep storage that is
used for examples is [S3](https://aws.amazon.com/s3/) and [memcached](http://memcached.org/) is used for a distributed cache.
The nodes that respond to queries (Historical, Broker, and Middle manager nodes) will use as many cores as are available, depending on usage, so it is best to keep these on dedicated machines. The upper limit of effectively utilized cores is not well characterized yet and would depend on types of queries, query load, and the schema. Historical daemons should have a heap a size of at least 1GB per core for normal usage, but could be squeezed into a smaller heap for testing. Since in-memory caching is essential for good performance, even more RAM is better. Broker nodes will use RAM for caching, so they do more than just route queries. SSDs are highly recommended for Historical nodes not all data is loaded in available memory.
```note-info
The nodes in this example do not need to be on their own individual servers. Overlord and Coordinator nodes should be
co-located on the same hardware.
```
The nodes that are responsible for coordination (Coordinator and Overlord nodes) require much less processing.
The nodes that respond to queries (Historical, Broker, and MiddleManager nodes) will use as many cores as are available,
depending on usage, so it is best to keep these on dedicated machines. The upper limit of effectively utilized cores is
not well characterized yet and would depend on types of queries, query load, and the schema. Historical daemons should
have a heap size of at least 1GB per core for normal usage, but could be squeezed into a smaller heap for testing.
Since in-memory caching is essential for good performance, even more RAM is better.
Broker nodes will use RAM for caching, so they do more than just route queries.
SSDs are highly recommended for Historical nodes when they have more segments loaded than available memory.
The effective utilization of cores by Zookeeper, metadata storage, and Coordinator nodes is likely to be between 1 and 2 for each process/daemon, so these could potentially share a machine with lots of cores. These daemons work with heap a size between 500MB and 1GB.
The nodes that are responsible for coordination (Coordinator and Overlord nodes) require much less processing.
We'll use r3.8xlarge nodes for query facing nodes and m1.xlarge nodes for coordination nodes. The following examples work relatively well in production, however, a more optimized tuning for the nodes we selected and more optimal hardware for a Druid cluster are both definitely possible.
The effective utilization of cores by Zookeeper, metadata storage, and Coordinator nodes is likely to be between 1 and 2
for each process/daemon, so these could potentially share a machine with lots of cores. These daemons work with a heap
size between 500MB and 1GB.
For general purposes of high availability, there should be at least 2 of every node type.
We'll use [EC2](https://aws.amazon.com/ec2/) r3.8xlarge nodes for query facing nodes and m1.xlarge nodes for coordination nodes.
The following examples work relatively well in production, however, a more optimized tuning for the nodes we selected and
more optimal hardware for a Druid cluster are both definitely possible.
To setup a local Druid cluster, see [Simple Cluster Configuration](../configuration/simple-cluster.html).
```note-caution
For high availability, there should be at least a redundant copy of every process running on separate hardware.
```
### Common Configuration (common.runtime.properties)
......
---
layout: doc_page
---
```note-caution
If you are doing stream-pull based ingestion, we suggest using [stream-push](../ingestion/stream-push.html) based ingestion instead and not
using real-time nodes.
```
Realtime Node Configuration
==============================
For general Realtime Node information, see [here](../design/realtime.html).
......
---
layout: doc_page
---
Simple Cluster Configuration
===============================
This simple Druid cluster configuration can be used for initially experimenting with Druid on your local machine. For a more realistic production Druid cluster, see [Production Cluster Configuration](../configuration/production-cluster.html).
### Common Configuration (common.runtime.properties)
```
# Extensions
-Ddruid.extensions.loadList=["druid-kafka-eight"]
# Zookeeper (defaults to localhost)
# Metadata Storage (defaults to derby with no username and password)
```
### Overlord Node (Indexing Service)
Run:
```
io.druid.cli.Main server overlord
```
Configuration:
```
-server
-Xmx256m
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
-Ddruid.indexer.queue.startDelay=PT0M
-Ddruid.indexer.runner.javaOpts=-server -Xmx1g
-Ddruid.indexer.fork.property.druid.processing.numThreads=1
-Ddruid.indexer.fork.property.druid.computation.buffer.size=100000000
```
This runs the indexing service in local mode, and can support real-time ingestion tasks (with one processing thread for queries).
### Coordinator Node
Run:
```
io.druid.cli.Main server coordinator
```
Configuration:
```
-server
-Xmx256m
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.coordinator.startDelay=PT70s
```
This simple coordinator assumes local deep storage.
### Historical Node
Run:
```
io.druid.cli.Main server historical
```
Configuration:
```
-server
-Xmx256m
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.server.maxSize=10000000000
druid.processing.buffer.sizeBytes=100000000
druid.processing.numThreads=1
druid.segmentCache.locations=[{"path": "/tmp/druid/indexCache", "maxSize"\: 10000000000}]
```
This historical node will be able to load 10 GB of data and be able to process 1 segment at a time. Deep storage is assumed to be local storage here.
### Broker Node
Run:
```
io.druid.cli.Main server broker
```
Configuration:
```
-server
-Xmx256m
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.processing.buffer.sizeBytes=100000000
druid.processing.numThreads=1
```
This simple broker will run groupBys in a single thread.
......@@ -260,7 +260,7 @@ Disables a datasource.
* `/druid/coordinator/v1/datasources/{dataSourceName}?kill=true&interval={myISO8601Interval}`
Runs a [Kill task](../misc/tasks.html) for a given interval and datasource.
Runs a [Kill task](../ingestion/tasks.html) for a given interval and datasource.
* `/druid/coordinator/v1/datasources/{dataSourceName}/segments/{segmentId}`
......
......@@ -2,7 +2,8 @@
layout: doc_page
---
For a comprehensive look at the architecture of Druid, read the [White Paper](http://static.druid.io/docs/druid.pdf).
For a comprehensive look at the architecture of Druid, read the [White Paper](http://static.druid.io/docs/druid.pdf). Please note
that Druid is undergoing rapid development and the white paper may be out of date.
What is Druid?
==============
......@@ -19,7 +20,7 @@ Druid currently allows for single-table queries in a similar manner to [Dremel](
As far as a comparison of systems is concerned, Druid sits in between PowerDrill and Dremel on the spectrum of functionality. It implements almost everything Dremel offers (Dremel handles arbitrary nested data structures while Druid only allows for a single level of array-based nesting) and gets into some of the interesting data layout and compression methods from PowerDrill.
Druid is a good fit for products that require real-time data ingestion of a single, large data stream. Especially if you are targeting no-downtime operation and are building your product on top of a time-oriented summarization of the incoming data stream. Druid is probably not the right solution if you care more about query flexibility and raw data access than query speed and no-downtime operation. When talking about query speed it is important to clarify what "fast" means: with Druid it is entirely within the realm of possibility (we have done it) to achieve queries that run in less than a second across terabytes of data.
Druid is a good fit for products that require real-time data ingestion of a single, large data stream, especially if you are targeting no-downtime operation and are building your product on top of a time-oriented summarization of the incoming data stream. When talking about query speed it is important to clarify what "fast" means: with Druid it is entirely within the realm of possibility (we have done it) to achieve queries that run in less than a second across trillions of rows of data.
### Architecture
......@@ -33,9 +34,10 @@ The node types that currently exist are:
* [**Realtime**](../design/realtime.html) nodes ingest data in real time. They are in charge of listening to a stream of incoming data and making it available immediately inside the Druid system. Real-time nodes respond to query requests from Broker nodes, returning query results to those nodes. Aged data is pushed from Realtime nodes to deep storage. Realtime nodes monitor ZooKeeper to discover whether segments that they've pushed to deep storage have been loaded by Historicals&mdash;if so, they drop those segments.
* [**Coordinator**](../design/coordinator.html) nodes monitor the grouping of historical nodes to ensure that data is available, replicated and in a generally "optimal" configuration. They do this by reading segment metadata information from metadata storage to determine what segments should be loaded in the cluster, using Zookeeper to determine what Historical nodes exist, and creating Zookeeper entries to tell Historical nodes to load and drop new segments.
* [**Broker**](../design/broker.html) nodes receive queries from external clients and forward those queries to Realtime and Historical nodes. When Broker nodes receive results, they merge these results and return them to the caller. For knowing topology, Broker nodes use Zookeeper to determine what Realtime and Historical nodes exist.
* [**Indexer**](../design/indexing-service.html) nodes form a cluster of workers to load batch and real-time data into the system as well as allow for alterations to the data stored in the system (also known as the Indexing Service).
* [**Indexing Service**](../design/indexing-service.html) nodes form a cluster of workers to load batch and real-time data into the system as well as allow for alterations to the data stored in the system.
* [**Realtime**](../design/realtime.html) nodes also load real-time data into the system. They are simpler to set up than the indexing service, at the cost of several [limitations](../ingestion/stream-pull.html#limitations) for production use.
This separation allows each node to only care about what it is best at. By separating Historical and Realtime, we separate the memory concerns of listening on a real-time stream of data and processing it for entry into the system. By separating the Coordinator and Broker, we separate the needs for querying from the needs for maintaining "good" data distribution across the cluster.
This separation allows each node to only care about what it is best at. By separating Historical and Realtime processing, we separate the memory concerns of listening on a real-time stream of data and processing it for entry into the system. By separating the Coordinator and Broker, we separate the needs for querying from the needs for maintaining "good" data distribution across the cluster.
The following diagram shows how queries and data flow through this architecture, and which nodes (and external dependencies, discussed below) are involved:
......@@ -61,10 +63,9 @@ Getting data into the Druid system requires an indexing process, as shown in the
- Converted to columnar format
- Indexed with bitmap indexes
- Compressed using various algorithms
- LZF (switching to Snappy is on the roadmap, not yet implemented)
- Dictionary encoding w/ id storage minimization
- Bitmap compression
- RLE (on the roadmap, but not yet implemented)
- LZ4 for all columns
- Dictionary encoding w/ id storage minimization for String columns
- Bitmap compression for bitmap indexes
The output of the indexing process is called a "segment". Segments are the fundamental structure to store data in Druid. Segments contain the various dimensions and metrics in a data set, stored in a column orientation, as well as the indexes for those columns.
......@@ -79,6 +80,7 @@ In order for a segment to exist inside of the cluster, an entry has to be added
- **Historical** As discussed above, if a historical node dies, another historical node can take its place and there is no fear of data loss.
- **Coordinator** Can be run in a hot fail-over configuration. If no coordinators are running, then changes to the data topology will stop happening (no new data and no data balancing decisions), but the system will continue to run.
- **Broker** Can be run in parallel or in hot fail-over.
- **Indexing Service** Workers run with replicated ingestion tasks, coordination piece has hot fail-over.
- **Realtime** Depending on the semantics of the delivery stream, multiple of these can be run in parallel processing the exact same stream. They periodically checkpoint to disk and eventually push out to deep storage. Steps are taken to be able to recover from process death, but loss of access to the local disk can result in data loss if this is the only method of adding data to the system.
- **"deep storage" file system** If this is not available, new data will not be able to enter the cluster, but the cluster will continue operating as is.
- **metadata storage** If this is not available, the Coordinator will be unable to find out about new segments in the system, but it will continue with its current view of the segments that should exist in the cluster.
......@@ -86,7 +88,7 @@ In order for a segment to exist inside of the cluster, an entry has to be added
### Query processing
A query first enters the Broker, where the Broker will match the query with the data segments that are known to exist. It will then pick a set of machines that are serving those segments and rewrite the query for each server to specify the segment(s) targetted. The Historical/Realtime nodes will take in the query, process them and return results. The Broker then takes the results and merges them together to get the final answer, which it returns. In this way, the broker can prune all of the data that doesn’t match a query before ever even looking at a single row of data.
A query first enters the Broker, where the Broker will match the query with the data segments that are known to exist. It will then pick a set of machines that are serving those segments and rewrite the query for each server to specify the segment(s) targeted. The Historical/Realtime processes will take in the query, process it and return results. The Broker then takes the results and merges them together to get the final answer, which it returns. In this way, the broker can prune all of the data that doesn’t match a query before ever even looking at a single row of data.
For filters at a more granular level than what the Broker can prune based on, the indexing structures inside each segment allow the historical nodes to figure out which (if any) rows match the filter set before looking at any row of data. It can do all of the boolean algebra of the filter on the bitmap indices and never actually look directly at a row of data.
......
......@@ -3,9 +3,10 @@ layout: doc_page
---
Indexing Service
================
For Indexing Service Configuration, see [Indexing Service Configuration](../configuration/indexing-service.html).
The indexing service is a highly-available, distributed service that runs indexing related tasks. Indexing service [tasks](../misc/tasks.html) create (and sometimes destroy) Druid [segments](../design/segments.html). The indexing service has a master/slave like architecture.
The indexing service is a highly-available, distributed service that runs indexing related tasks. Indexing service [tasks](../ingestion/tasks.html) create (and sometimes destroy) Druid [segments](../design/segments.html). The indexing service has a master/slave like architecture.
The indexing service is composed of three main components: a peon component that can run a single task, a [Middle Manager](../design/middlemanager.html) component that manages peons, and an overlord component that manages task distribution to middle managers.
Overlords and middle managers may run on the same node or across multiple nodes while middle managers and [Peons](../design/peons.html) always run on the same node.
......@@ -86,7 +87,7 @@ See [Peon](../design/peons.html).
Tasks
-----
See [Tasks](../misc/tasks.html).
See [Tasks](../ingestion/tasks.html).
HTTP Endpoints
--------------
......@@ -96,4 +97,3 @@ HTTP Endpoints
* `/status`
Returns the Druid version, loaded extensions, memory used, total memory and other useful information about the node.
---
layout: doc_page
---
Real-time Node
==============
For Real-time Node Configuration, see [Realtime Configuration](../configuration/realtime.html).
For Real-time Ingestion, see [Realtime Ingestion](../ingestion/realtime-ingestion.html).
......
......@@ -4,7 +4,6 @@ layout: doc_page
Segments
========
Druid stores its index in *segment files*, which are partitioned by
time. In a basic setup, one segment file is created for each time
interval, where the time interval is configurable in the
......@@ -169,12 +168,18 @@ A ColumnDescriptor is essentially an object that allows us to use jackson’s po
Sharding Data to Create Segments
--------------------------------
### Sharding Data by Dimension
### Sharding
Multiple segments may exist for the same interval of time for the same datasource. These segments form a `block` for an interval.
Depending on the type of `shardSpec` that is used to shard the data, Druid queries may only complete if a `block` is complete. That is to say, if a block consists of 3 segments, such as:
`sampleData_2011-01-01T02:00:00.000Z_2011-01-01T03:00:00.000Z_v1_0`
`sampleData_2011-01-01T02:00:00.000Z_2011-01-01T03:00:00.000Z_v1_1`
`sampleData_2011-01-01T02:00:00.000Z_2011-01-01T03:00:00.000Z_v1_2`
All 3 segments must be loaded before a query for the interval `2011-01-01T02:00:00.000Z_2011-01-01T03:00:00.000Z` completes.
If the cumulative total number of rows for the different values of a
given column exceed some configurable threshold, multiple segments
representing the same time interval for the same datasource may be
created. These segments will contain some partition number as part of
their identifier. Sharding by dimension reduces some of the the costs
associated with operations over high cardinality dimensions. For more
information on sharding, see the ingestion documentation.
The exception to this rule is with using linear shard specs. Linear shard specs do not force 'completeness' and queries can complete even if shards are not loaded in the system.
For example, if your real-time ingestion creates 3 segments that were sharded with linear shard spec, and only two of the segments were loaded in the system, queries would return results only for those 2 segments.
......@@ -4,7 +4,8 @@ layout: doc_page
### Build from Source
Druid can be set up by building from source via git.
You can build Druid directly from source. Please note that these instructions are for building the latest stable release of Druid.
For building the latest code in master, follow the instructions [here](https://github.com/druid-io/druid/blob/master/docs/content/development/build.md).
Building Druid requires the following:
- [JDK 7](http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html)
......@@ -22,10 +23,8 @@ mvn clean install
This will compile the project and create the Druid binary distribution tar under
`distribution/target/druid-VERSION-bin.tar.gz`.
This will also create `distribution/target/mysql-metadata-storage-bin.tar.gz`,
which is a tarball that contains the `mysql-metadata-storage` extension.
You can find the example executables in the examples/bin directory:
* run_example_server.sh
* run_example_client.sh
This will also create a tarball that contains the `mysql-metadata-storage` extension under
`distribution/target/mysql-metadata-storage-bin.tar.gz`. If you want Druid to load `mysql-metadata-storage`, you can
first untar `druid-VERSION-bin.tar.gz`, then go to ```druid-<version>/extensions``` and untar `mysql-metadata-storage-bin.tar.gz`
there. Now just specify `mysql-metadata-storage` in `druid.extensions.loadList` so that Druid will pick it up.
See [Including Extensions](../operations/including-extensions.html) for more information.
......@@ -2,6 +2,9 @@
layout: doc_page
---
Query Libraries
---------------
#### Python
* [druid-io/pydruid](https://github.com/druid-io/pydruid) - A python client for Druid
......@@ -18,7 +21,7 @@ Some great folks have written their own libraries to interact with Druid
#### JavaScript
* [facetjs/facetjs](https://github.com/facetjs/facetjs) - A general query planner for Druid written in JavaScript
* [implydata/plywood](https://github.com/implydata/plywood) - A higher level API for Druid. An extension of the work that was started in facet.js.
#### Node.js
......@@ -35,12 +38,8 @@ Some great folks have written their own libraries to interact with Druid
#### SQL
* [implydata/plyql](https://github.com/implydata/plyql) - A command line interface for issuing SQL queries to Druid via [plywood](https://github.com/implydata/plywood)
* [srikalyc/Sql4D](https://github.com/srikalyc/Sql4D) - A SQL client for Druid. Used in production at Yahoo.
* [facetjs/facet-cli](https://github.com/facetjs/facet-cli) - A command line interface for issuing SQL queries to Druid via [facetjs](https://github.com/facetjs/facetjs)
#### TypeScript
* [facetjs/typescript-druid](https://github.com/facetjs/typescript-druid) - TypeScript declarations for the Druid API
Community Helper Libraries
......@@ -50,6 +49,15 @@ Community Helper Libraries
* [housejester/druid-test-harness](https://github.com/housejester/druid-test-harness) - A set of scripts to simplify standing up some servers and seeing how things work
* [mingfang/docker-druid](https://github.com/mingfang/docker-druid) - A Dockerfile to run the entire Druid cluster
Other Druid Distributions
-------------------------
* [Imply Analytics Platform](http://imply.io/download) - The Imply Analytics platform repackages Druid, all its dependencies, and a UI and SQL layer.
Tools
---
* [Insert Segments](../../operations/insert-segment-to-db.html) - A tool that can insert segments' metadata into Druid metadata storage.
UIs
---
......@@ -64,8 +72,8 @@ Tools
* [Insert Segments](../../operations/insert-segment-to-db.html) - A tool that can insert segments' metadata into Druid metadata storage.
Community Extensions
--------------------
Other Community Extensions
--------------------------
These are extensions from the community. (If you would like yours listed please speak up!)
......
......@@ -3,113 +3,84 @@ layout: doc_page
---
# Batch Data Ingestion
There are two choices for batch data ingestion to your Druid cluster, you can use the [Indexing service](../design/indexing-service.html) or you can use the `HadoopDruidIndexer`.
Which should I use?
-------------------
Druid can load data from static files through a variety of methods described here.
The [Indexing service](../design/indexing-service.html) is a set of nodes that can run as part of your Druid cluster and can accomplish a number of different types of indexing tasks. Even if all you care about is batch indexing, it provides for the encapsulation of things like the [metadata store](../dependencies/metadata-storage.html) that is used for segment metadata and other things, so that your indexing tasks do not need to include such information. The indexing service was created such that external systems could programmatically interact with it and run periodic indexing tasks. Long-term, the indexing service is going to be the preferred method of ingesting data.
## Hadoop-based Batch Ingestion
The `HadoopDruidIndexer` runs hadoop jobs in order to separate and index data segments. It takes advantage of Hadoop as a job scheduling and distributed job execution platform. It is a simple method if you already have Hadoop running and don’t want to spend the time configuring and deploying the [Indexing service](../design/indexing-service.html) just yet.
## Batch Ingestion using the HadoopDruidIndexer
The HadoopDruidIndexer can be run like so:
```
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:<hadoop_config_path> io.druid.cli.Main index hadoop <spec_file>
```
## Hadoop "specFile"
The spec\_file is a path to a file that contains JSON and an example looks like:
Hadoop-based batch ingestion in Druid is supported via a Hadoop-ingestion task. These tasks can be posted to a running instance
of a Druid [overlord](../design/indexing-service.html). A sample task is shown below:
```json
{
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "hadoopyString",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
"type" : "index_hadoop",
"spec" : {
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "hadoopyString",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"],
"dimensionExclusions" : [],
"spatialDimensions" : []
}
}
},
"metricsSpec" : [
{
"type" : "count",
"name" : "count"
},
{
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
},
{
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
},
"dimensionsSpec" : {
"dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"],
"dimensionExclusions" : [],
"spatialDimensions" : []
{
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "DAY",
"queryGranularity" : "NONE",
"intervals" : [ "2013-08-31/2013-09-01" ]
}
},
"metricsSpec" : [
{
"type" : "count",
"name" : "count"
},
{
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
},
{
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
},
{
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
"ioConfig" : {
"type" : "hadoop",
"inputSpec" : {
"type" : "static",
"paths" : "/MyDirectory/example/wikipedia_data.json"
}
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "DAY",
"queryGranularity" : "NONE",
"intervals" : [ "2013-08-31/2013-09-01" ]
}
},
"ioConfig" : {
"type" : "hadoop",
"inputSpec" : {
"type" : "static",
"paths" : "/MyDirectory/examples/indexing/wikipedia_data.json"
},
"metadataUpdateSpec" : {
"type":"mysql",
"connectURI" : "jdbc:mysql://localhost:3306/druid",
"password" : "diurd",
"segmentTable" : "druid_segments",
"user" : "druid"
},
"segmentOutputPath" : "/MyDirectory/data/index/output"
},
"tuningConfig" : {
"type" : "hadoop",
"workingPath": "/tmp",
"partitionsSpec" : {
"type" : "dimension",
"partitionDimension" : null,
"targetPartitionSize" : 5000000,
"maxPartitionSize" : 7500000,
"assumeGrouped" : false,
"numShards" : -1
},
"shardSpecs" : { },
"leaveIntermediate" : false,
"cleanupOnFailure" : true,
"overwriteFiles" : false,
"ignoreInvalidRows" : false,
"jobProperties" : { },
"combineText" : false,
"rowFlushBoundary" : 300000,
"buildV9Directly" : false
"tuningConfig" : {
"type": "hadoop"
}
}
}
```
|property|description|required?|
|--------|-----------|---------|
|type|The task type, this should always be "index_hadoop".|yes|
|spec|A Hadoop Index Spec. See [Batch Ingestion](../ingestion/batch-ingestion.html)|yes|
|hadoopDependencyCoordinates|A JSON array of Hadoop dependency coordinates that Druid will use, this property will override the default Hadoop coordinates. Once specified, Druid will look for those Hadoop dependencies from the location specified by `druid.extensions.hadoopDependenciesDir`|no|
|classpathPrefix|Classpath that will be prepended for the peon process.|no|
### DataSchema
This field is required.
......@@ -138,7 +109,6 @@ Is a type of inputSpec where a static path to where the data files are located i
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|paths|Array of String|A String of input paths indicating where the raw data is located.|yes|
|inputFormat|String|The input format of the data files. Default is `org.apache.hadoop.mapreduce.lib.input.TextInputFormat`, or `org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat` if `combineText` in tuningConfig is `true`.|no|
For example, using the static input paths:
......@@ -156,7 +126,6 @@ Is a type of inputSpec that expects data to be laid out in a specific path forma
|inputPath|String|Base path to append the expected time path to.|yes|
|filePattern|String|Pattern that files should match to be included.|yes|
|pathFormat|String|Joda date-time format for each directory. Default value is `"'y'=yyyy/'m'=MM/'d'=dd/'H'=HH"`, or see [Joda documentation](http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html)|no|
|inputFormat|String|The input format of the data files. Default is `org.apache.hadoop.mapreduce.lib.input.TextInputFormat`, or `org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat` if `combineText` in tuningConfig is `true`.|no|
For example, if the sample config were run with the interval 2012-06-01/2012-06-02, it would expect data at the paths
......@@ -175,21 +144,6 @@ Read Druid segments. See [here](../ingestion/update-existing-data.html) for more
Read multiple sources of data. See [here](../ingestion/update-existing-data.html) for more information.
#### Metadata Update Job Spec
This is a specification of the properties that tell the job how to update metadata such that the Druid cluster will see the output segments and load them.
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|type|String|"metadata" is the only value available.|yes|
|connectURI|String|A valid JDBC url to metadata storage.|yes|
|user|String|Username for db.|yes|
|password|String|password for db.|yes|
|segmentTable|String|Table to use in DB.|yes|
These properties should parrot what you have configured for your [Coordinator](../design/coordinator.html).
### TuningConfig
The tuningConfig is optional and default parameters will be used if no tuningConfig is specified.
......@@ -266,123 +220,64 @@ The configuration options are:
### Remote Hadoop Cluster
If you have a remote Hadoop cluster, make sure to include the folder holding your configuration `*.xml` files in the classpath of the indexer.
If you have a remote Hadoop cluster, make sure to include the folder holding your configuration `*.xml` files in your Druid `_common` configuration folder.
If you are having dependency problems with your version of Hadoop and the version compiled with Druid, please see [these docs](../operations/other-hadoop.html).
Batch Ingestion Using the Indexing Service
------------------------------------------
### Using Elastic MapReduce
Batch ingestion for the indexing service is done by submitting an [Index Task](../misc/tasks.html) (for datasets < 1G) or a [Hadoop Index Task](../misc/tasks.html). The indexing service can be started by issuing:
If your cluster is running on Amazon Web Services, you can use Elastic MapReduce (EMR) to index data
from S3. To do this:
- Create a persistent, [long-running cluster](http://docs.aws.amazon.com/ElasticMapReduce/latest/ManagementGuide/emr-plan-longrunning-transient.html).
- When creating your cluster, enter the following configuration. If you're using the wizard, this
should be in advanced mode under "Edit software settings".
```
java -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/overlord io.druid.cli.Main server overlord
classification=yarn-site,properties=[mapreduce.reduce.memory.mb=6144,mapreduce.reduce.java.opts=-server -Xms2g -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps,mapreduce.map.memory.mb=758,mapreduce.map.java.opts=-server -Xms512m -Xmx512m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps,mapreduce.task.timeout=1800000]
```
This will start up a very simple local indexing service. For more complex deployments of the indexing service, see [here](../design/indexing-service.html).
- Follow the instructions under "[Configure Hadoop for data
loads](cluster.html#configure-cluster-for-hadoop-data-loads)" using the XML files from
`/etc/hadoop/conf` on your EMR master.
The schema of the Hadoop Index Task contains a task "type" and a Hadoop Index Config. A sample Hadoop index task is shown below:
#### Loading from S3 with EMR
```json
{
"type" : "index_hadoop",
"spec" : {
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "hadoopyString",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"],
"dimensionExclusions" : [],
"spatialDimensions" : []
}
}
},
"metricsSpec" : [
{
"type" : "count",
"name" : "count"
},
{
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
},
{
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
},
{
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "DAY",
"queryGranularity" : "NONE",
"intervals" : [ "2013-08-31/2013-09-01" ]
}
},
"ioConfig" : {
"type" : "hadoop",
"inputSpec" : {
"type" : "static",
"paths" : "/MyDirectory/examples/indexing/wikipedia_data.json"
}
},
"tuningConfig" : {
"type": "hadoop"
}
}
- In the `jobProperties` field in the `tuningConfig` section of your Hadoop indexing task, add:
```
"jobProperties" : {
"fs.s3.awsAccessKeyId" : "YOUR_ACCESS_KEY",
"fs.s3.awsSecretAccessKey" : "YOUR_SECRET_KEY",
"fs.s3.impl" : "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
"fs.s3n.awsAccessKeyId" : "YOUR_ACCESS_KEY",
"fs.s3n.awsSecretAccessKey" : "YOUR_SECRET_KEY",
"fs.s3n.impl" : "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
"io.compression.codecs" : "org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec"
}
```
### DataSchema
Note that this method uses Hadoop's builtin S3 filesystem rather than Amazon's EMRFS, and is not compatible
with Amazon-specific features such as S3 encryption and consistent views. If you need to use those
features, you will need to make the Amazon EMR Hadoop JARs available to Druid through one of the
mechanisms described in the [Using other Hadoop distributions](#using-other-hadoop-distributions) section.
This field is required.
### Using other Hadoop distributions
See [Ingestion](../ingestion/index.html)
Druid works out of the box with many Hadoop distributions.
### IOConfig
This field is required.
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|type|String|This should always be 'hadoop'.|yes|
|pathSpec|Object|a specification of where to pull the data in from|yes|
### TuningConfig
The tuningConfig is optional and default parameters will be used if no tuningConfig is specified. This is the same as the tuningConfig for the standalone Hadoop indexer. See above for more details.
If you are having dependency conflicts between Druid and your version of Hadoop, you can try
searching for a solution in the [Druid user groups](https://groups.google.com/forum/#!forum/druid-user),
or reading the Druid [Different Hadoop Versions](../operations/other-hadoop.html) documentation.
### Running the Task
## Command Line Hadoop Indexer
The Hadoop Index Config submitted as part of an Hadoop Index Task is identical to the Hadoop Index Config used by the `HadoopDruidIndexer` except that three fields must be omitted: `segmentOutputPath`, `workingPath`, `metadataUpdateSpec`. The Indexing Service takes care of setting these fields internally.
If you don't want to use a full indexing service to use Hadoop to get data into Druid, you can also use the standalone command line Hadoop indexer.
See [here](../ingestion/command-line-hadoop-indexer.html) for more info.
To run the task:
```
curl -X 'POST' -H 'Content-Type:application/json' -d @example_index_hadoop_task.json localhost:8090/druid/indexer/v1/task
```
If the task succeeds, you should see in the logs of the indexing service:
```
2013-10-16 16:38:31,945 INFO [pool-6-thread-1] io.druid.indexing.overlord.exec.TaskConsumer - Task SUCCESS: HadoopIndexTask...
```
### Remote Hadoop Cluster
## IndexTask-based Batch Ingestion
If you have a remote Hadoop cluster, make sure to include the folder holding your configuration `*.xml` files in the classpath of the middle manager.
If you do not want to have a dependency on Hadoop for batch ingestion, you can also use the index task. This task will be much slower and less scalable than the Hadoop-based method. See [here](../ingestion/tasks.html) for more info.
Having Problems?
----------------
Getting data into Druid can definitely be difficult for first time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](https://groups.google.com/forum/#!forum/druid-development).
Getting data into Druid can definitely be difficult for first time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](https://groups.google.com/forum/#!forum/druid-user).
---
layout: doc_page
---
# Command Line Hadoop Indexer
To run:
```
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:<hadoop_config_dir> io.druid.cli.Main index hadoop <spec_file>
```
The spec file needs to contain a JSON object where the contents are the same as the "spec" field in the Hadoop index task.
In addition, the following fields need to be added to the ioConfig:
```
"ioConfig" : {
...
"metadataUpdateSpec" : {
"type":"mysql",
"connectURI" : "jdbc:mysql://localhost:3306/druid",
"password" : "diurd",
"segmentTable" : "druid_segments",
"user" : "druid"
},
"segmentOutputPath" : "/MyDirectory/data/index/output"
},
```
and the following field needs to be added to the tuningConfig:
```
"tuningConfig" : {
...
"workingPath": "/tmp",
...
}
```
#### Metadata Update Job Spec
This is a specification of the properties that tell the job how to update metadata such that the Druid cluster will see the output segments and load them.
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|type|String|"metadata" is the only value available.|yes|
|connectURI|String|A valid JDBC url to metadata storage.|yes|
|user|String|Username for db.|yes|
|password|String|password for db.|yes|
|segmentTable|String|Table to use in DB.|yes|
These properties should parrot what you have configured for your [Coordinator](../design/coordinator.html).
#### segmentOutputPath Config
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|segmentOutputPath|String|the path to dump segments into.|yes|
#### workingPath Config
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|workingPath|String|the working path to use for intermediate results (results between Hadoop jobs).|no (default == '/tmp/druid-indexing')|
Please note that the command line Hadoop indexer doesn't have the locking capabilities of the indexing service, so if you choose to use it,
you have to take care not to overwrite segments created by real-time processing (if you have a real-time pipeline set up).
......@@ -6,11 +6,9 @@ layout: doc_page
### Realtime Ingestion
If you are trying to stream in historical (not current time) data into Druid and you are using the [serverTime](../ingestion/realtime-ingestion.html) rejection policy in your ingestion spec (the default rejection policy), Druid will not ingest this data as it is outside of the acceptable window period. You can verify this is what is happening by looking at the logs of your real-time process for log lines containing "ingest/events/*". These metrics will indicate the events ingested, rejected, etc. We recommend using batch ingestion methods for historical data in production.
If you are doing a POC, you can use the [messageTime](../ingestion/realtime-ingestion.html) rejection policy, but please be aware of the hand-off caveats. This rejection policy is not recommended in production.
If you are experimenting with realtime ingestion, you can also use the [none](../ingestion/realtime-ingestion.html) rejection policy to load all incoming events, but hand-off will never occur.
The most common cause of this is because events being ingested are out of band of Druid's `windowPeriod`. Druid realtime ingestion
only accepts events within a configurable windowPeriod of the current time. You can verify this is what is happening by looking at the logs of your real-time process for log lines containing "ingest/events/*". These metrics will indicate the events ingested, rejected, etc.
We recommend using batch ingestion methods for historical data in production.
### Batch Ingestion
......@@ -30,17 +28,17 @@ If the number of ingested events seem correct, make sure your query is correctly
Depending on what `druid.storage.type` is set to, Druid will upload segments to some [Deep Storage](../dependencies/deep-storage.html). Local disk is used as the default deep storage.
## My realtime node is not handing segments off
## My stream ingest is not handing segments off
First, make sure there are no exceptions in the logs of your node. Also make sure that `druid.storage.type` is set to a deep storage that makes sense.
First, make sure there are no exceptions in the logs of the ingestion process. Also make sure that `druid.storage.type` is set to a deep storage that isn't `local` if you are running a distributed cluster.
Other common reasons that hand-off fails are as follows:
1) Druid is unable to write to the metadata storage. Make sure your configuration is correct.
1) Druid is unable to write to the metadata storage. Make sure your configurations are correct.
2) Historical nodes are out of capacity and cannot download any more segments. You'll see exceptions in the coordinator logs if this occurs.
2) Historical nodes are out of capacity and cannot download any more segments. You'll see exceptions in the coordinator logs if this occurs and the coordinator console will show the historicals are near capacity.
3) Segments are corrupt and cannot download. You'll see exceptions in your historical nodes if this occurs.
3) Segments are corrupt and cannot be downloaded. You'll see exceptions in your historical nodes if this occurs.
4) Deep storage is improperly configured. Make sure that your segment actually exists in deep storage and that the coordinator logs have no errors.
......@@ -49,6 +47,7 @@ Other common reasons that hand-off fails are as follows:
Make sure to include the `druid-hdfs-storage` and all the hadoop configuration, dependencies (that can be obtained by running command `hadoop classpath` on a machine where hadoop has been setup) in the classpath. And, provide necessary HDFS settings as described in [Deep Storage](../dependencies/deep-storage.html) .
## I don't see my Druid segments on my historical nodes
You can check the coordinator console located at `<COORDINATOR_IP>:<PORT>`. Make sure that your segments have actually loaded on [historical nodes](../design/historical.html). If your segments are not present, check the coordinator logs for messages about capacity or replication errors. One reason that segments are not downloaded is because historical nodes have maxSizes that are too small, making them incapable of downloading more data. You can change that with (for example):
```
......@@ -58,7 +57,7 @@ You can check the coordinator console located at `<COORDINATOR_IP>:<PORT>`. Make
## My queries are returning empty results
You can check `<BROKER_IP>:<PORT>/druid/v2/datasources/<YOUR_DATASOURCE>?interval=0/3000` for the dimensions and metrics that have been created for your datasource. Make sure that the name of the aggregators you use in your query match one of these metrics. Also make sure that the query interval you specify match a valid time range where data exists. Note: the broker endpoint will only return valid results on historical segments and not segments served by real-time nodes.
You can use a [segment metadata query](../querying/segmentmetadataquery.html) to see the dimensions and metrics that have been created for your datasource. Make sure that the name of each aggregator you use in your query matches one of these metrics. Also make sure that the query interval you specify matches a valid time range where data exists.
## How can I Reindex existing data in Druid with schema changes?
......
---
layout: doc_page
---
# Ingestion Overview
There are a couple of different ways to get data into Druid. We hope to unify things in the near future, but for the time being
the method you choose to ingest your data into Druid should be driven by your use case.
## Streaming Data
If you have a continuous stream of data, there are a few options to get your data into Druid. It should be noted that the current state of real-time ingestion in Druid does not guarantee exactly once processing. The real-time pipeline is meant to surface insights on
events as they are occurring. For an accurate copy of ingested data, an accompanying batch pipeline is required. We are working towards a streaming-only world, but for
the time being, we recommend running a lambda architecture.
### Ingest from a Stream Processor
If you process your data using a stream processor such as Apache Samza or Apache Storm, you can use the [Tranquility](https://github.com/druid-io/tranquility) library to manage
your real-time ingestion. This setup requires using the indexing service for ingestion, which is what is used in production by many organizations that use Druid.
### Ingest from Apache Kafka
If you wish to ingest directly from Kafka using Tranquility, you will have to write a consumer that reads from Kafka and passes the data to Tranquility.
The other option is to use [standalone Realtime nodes](../design/realtime.html).
It should be noted that standalone realtime nodes use the Kafka high level consumer, which imposes a few restrictions.
Druid replicates segments such that logically equivalent data segments are concurrently hosted on N nodes. If N–1 nodes go down,
the data will still be available for querying. On real-time nodes, this process depends on maintaining logically equivalent
data segments on each of the N nodes, which is not possible with standard Kafka consumer groups if your Kafka topic requires more than one consumer
(because consumers in different consumer groups will split up the data differently).
For example, let's say your topic is split across Kafka partitions 1, 2, & 3 and you have 2 real-time nodes with linear shard specs 1 & 2.
Both of the real-time nodes are in the same consumer group. Real-time node 1 may consume data from partitions 1 & 3, and real-time node 2 may consume data from partition 2.
Querying for your data through the broker will yield correct results.
The problem arises if you want to replicate your data by creating real-time nodes 3 & 4. These new real-time nodes also
have linear shard specs 1 & 2, and they will consume data from Kafka using a different consumer group. In this case,
real-time node 3 may consume data from partitions 1 & 2, and real-time node 4 may consume data from partition 3.
From Druid's perspective, the segments hosted by real-time nodes 1 and 3 are the same, and the data hosted by real-time nodes
2 and 4 are the same, although they are reading from different Kafka partitions. Querying for the data will yield inconsistent
results.
Is this always a problem? No. If your data is small enough to fit on a single Kafka partition, you can replicate without issues.
Otherwise, you can run real-time nodes without replication.
## Large Batch of Static Data
If you have a large batch of historical data that you want to load all at once into Druid, you should use Druid's built in support for
Hadoop-based indexing. Hadoop-based indexing is the fastest way to load large (> 1G) batches of data into Druid. If you wish to avoid
the Hadoop dependency, or if you do not have a Hadoop cluster present, you can look at using the [index task](../ingestion/tasks.html). The index task will be much slower
than Hadoop indexing for ingesting batch data.
One pattern that we've seen is to store raw events (or processed events) in deep storage (S3, HDFS, etc) and periodically run batch processing jobs over these raw events.
You can, for example, create a directory structure for your raw data, such as the following:
```
/prod/<dataSource>/v=1/y=2015/m=03/d=21/H=20/data.gz
/prod/<dataSource>/v=1/y=2015/m=03/d=21/H=21/data.gz
/prod/<dataSource>/v=1/y=2015/m=03/d=21/H=22/data.gz
```
In this example, hourly raw events are stored in individual gzipped files. Periodic batch processing jobs can then run over these files.
## Lambda Architecture
We recommend running a streaming real-time pipeline to run queries over events as they are occurring and a batch pipeline to perform periodic
cleanups of data.
## Sharding
Multiple segments may exist for the same interval of time for the same datasource. These segments form a `block` for an interval.
Depending on the type of `shardSpec` that is used to shard the data, Druid queries may only complete if a `block` is complete. That is to say, if a block consists of 3 segments, such as:
`sampleData_2011-01-01T02:00:00:00Z_2011-01-01T03:00:00:00Z_v1_0`
`sampleData_2011-01-01T02:00:00:00Z_2011-01-01T03:00:00:00Z_v1_1`
`sampleData_2011-01-01T02:00:00:00Z_2011-01-01T03:00:00:00Z_v1_2`
All 3 segments must be loaded before a query for the interval `2011-01-01T02:00:00:00Z_2011-01-01T03:00:00:00Z` completes.
The exception to this rule is with using linear shard specs. Linear shard specs do not force 'completeness' and queries can complete even if shards are not loaded in the system.
For example, if your real-time ingestion creates 3 segments that were sharded with linear shard spec, and only two of the segments were loaded in the system, queries would return results only for those 2 segments.
---
layout: doc_page
---
# Loading streams
Streams can be ingested in Druid using either [Tranquility](https://github.com/druid-io/tranquility) (a Druid-aware
client) and the [indexing service](../design/indexing-service.html) or through standalone [Realtime nodes](../design/realtime.html).
The first approach will be more complex to set up, but also offers scalability and high availability characteristics that advanced production
setups may require. The second approach has some known [limitations](../ingestion/stream-pull.html#limitations).
## Stream push
If you have a program that generates a stream, then you can push that stream directly into Druid in
real-time. With this approach, Tranquility is embedded in your data-producing application.
Tranquility comes with bindings for the
Storm and Samza stream processors. It also has a direct API that can be used from any JVM-based
program, such as Spark Streaming or a Kafka consumer.
Tranquility handles partitioning, replication, service discovery, and schema rollover for you,
seamlessly and without downtime. You only have to define your Druid schema.
For examples and more information, please see the [Tranquility README](https://github.com/druid-io/tranquility).
## Stream pull
If you have an external service that you want to pull data from, you have two options. The simplest
option is to set up a "copying" service that reads from the data source and writes to Druid using
the [stream push method](#stream-push).
Another option is *stream pull*. With this approach, a Druid Realtime Node ingests data from a
[Firehose](../ingestion/firehose.html) connected to the data you want to
read. Druid includes builtin firehoses for Kafka, RabbitMQ, and various other streaming systems.
## More information
For more information on loading streaming data via a push based approach, please see [here](../ingestion/stream-push.html).
For more information on loading streaming data via a pull based approach, please see [here](../ingestion/stream-pull.html).
......@@ -2,20 +2,29 @@
layout: doc_page
---
Realtime Data Ingestion
=======================
For general Real-time Node information, see [here](../design/realtime.html).
For Real-time Node Configuration, see [Realtime Configuration](../configuration/realtime.html).
Stream Pull Ingestion
=====================
For writing your own plugins to the real-time node, see [Firehose](../ingestion/firehose.html).
If you have an external service that you want to pull data from, you have two options. The simplest
option is to set up a "copying" service that reads from the data source and writes to Druid using
the [stream push method](stream-push.html).
There are two ways of ingesting real-time data. This can be achieved with a standalone real-time node, or using the [Tranquility](https://github.com/druid-io/tranquility) client library as part of the [Indexing Service](../design/indexing-service.html). For a full explanation of why there are two methods, please see [this link](https://groups.google.com/forum/#!searchin/druid-development/fangjin$20yang$20%22thoughts%22/druid-development/aRMmNHQGdhI/muBGl0Xi_wgJ). If you are comfortable with the limitations of standalone real-time nodes, you can use them as they are easier to set up. The indexing service is a more robust and highly available solution but will also require more effort to set up.
Another option is *stream pull*. With this approach, a Druid Realtime Node ingests data from a
[Firehose](../ingestion/firehose.html) connected to the data you want to
read. The Druid quickstart and tutorials do not include information about how to set up standalone realtime nodes, but
they can be used in place of Tranquility server and the indexing service. Please note that Realtime nodes have very different properties than
the indexing service.
## Realtime Node Ingestion
Much of the configuration governing Realtime nodes and the ingestion of data is set in the Realtime spec file, discussed on this page.
For general Real-time Node information, see [here](../design/realtime.html).
For Real-time Node Configuration, see [Realtime Configuration](../configuration/realtime.html).
For writing your own plugins to the real-time node, see [Firehose](../ingestion/firehose.html).
<a id="realtime-specfile"></a>
## Realtime "specFile"
......@@ -119,7 +128,7 @@ This field is required.
#### Firehose
See [Firehose](../ingestion/firehose.html) for more information on firehose configuration.
See [Firehose](../ingestion/firehose.html) for more information on various firehoses.
#### Plumber
......@@ -158,6 +167,7 @@ The following policies are available:
####<a id="sharding"></a> Sharding
Druid uses shards, or segments with partition numbers, to more efficiently handle large amounts of incoming data. In Druid, shards represent the segments that together cover a time interval based on the value of `segmentGranularity`. If, for example, `segmentGranularity` is set to "hour", then a number of shards may be used to store the data for that hour. Sharding along dimensions may also occur to optimize efficiency.
Segments are identified by datasource, time interval, and version. With sharding, a segment is also identified by a partition number. Typically, each shard will have the same version but a different partition number to uniquely identify it.
......@@ -175,6 +185,7 @@ Druid uses sharding based on the `shardSpec` setting you configure. The recommen
Keep in mind that the sharding configuration has nothing to do with the configured firehose. For example, if you set the partition number to 0, it doesn't mean that the Kafka firehose will consume only from topic partition 0.
##### Linear
This strategy provides the following advantages:
* There is no need to update the fileSpec configurations of existing nodes when adding new nodes.
......@@ -191,6 +202,7 @@ Configure `linear` under `schema`:
##### Numbered
This strategy is similar to `linear` except that it does not tolerate non-sequential partition numbering (it will *not* allow querying of partitions 0 and 2 if partition 1 is missing). It also requires explicitly setting the total number of partitions.
Configure `numbered` under `schema`:
......@@ -205,6 +217,7 @@ Configure `numbered` under `schema`:
##### Scale and Redundancy
The `shardSpec` configuration can be used to create redundancy by having the same `partitionNum` values on different nodes.
For example, if RealTimeNode1 has:
......@@ -240,11 +253,6 @@ then it can store segments with the same datasource, time interval, and version
You can use type `numbered` similarly. Note that type `none` is essentially type `linear` with all shards having a fixed `partitionNum` of 0.
## Realtime Ingestion using the Indexing Service
We strongly recommend using the client library [Tranquility](https://github.com/druid-io/tranquility) for this use case. Please read the documentation on the Tranquility web page.
## Constraints
The following table summarizes constraints between settings in the spec file for the Realtime subsystem.
......@@ -259,3 +267,48 @@ The following table summarizes constraints between settings in the spec file for
The normal, expected use cases have the following overall constraints: `intermediatePersistPeriod ≤ windowPeriod < segmentGranularity` and `queryGranularity ≤ segmentGranularity`
## Limitations
### Kafka
Standalone realtime nodes use the Kafka high level consumer, which imposes a few restrictions.
Druid replicates segments such that logically equivalent data segments are concurrently hosted on N nodes. If N–1 nodes go down,
the data will still be available for querying. On real-time nodes, this process depends on maintaining logically equivalent
data segments on each of the N nodes, which is not possible with standard Kafka consumer groups if your Kafka topic requires more than one consumer
(because consumers in different consumer groups will split up the data differently).
For example, let's say your topic is split across Kafka partitions 1, 2, & 3 and you have 2 real-time nodes with linear shard specs 1 & 2.
Both of the real-time nodes are in the same consumer group. Real-time node 1 may consume data from partitions 1 & 3, and real-time node 2 may consume data from partition 2.
Querying for your data through the broker will yield correct results.
The problem arises if you want to replicate your data by creating real-time nodes 3 & 4. These new real-time nodes also
have linear shard specs 1 & 2, and they will consume data from Kafka using a different consumer group. In this case,
real-time node 3 may consume data from partitions 1 & 2, and real-time node 4 may consume data from partition 3.
From Druid's perspective, the segments hosted by real-time nodes 1 and 3 are the same, and the data hosted by real-time nodes
2 and 4 are the same, although they are reading from different Kafka partitions. Querying for the data will yield inconsistent
results.
Is this always a problem? No. If your data is small enough to fit on a single Kafka partition, you can replicate without issues.
Otherwise, you can run real-time nodes without replication.
There is now also an [experimental low level Kafka firehose](../development/kafka-simple-consumer-firehose.html) which
solves the issues described above with using the high level Kafka consumer.
### Locking
Using stream pull ingestion with Realtime nodes together with batch ingestion may introduce data override issues. For example, if you
are generating hourly segments for the current day, and run a daily batch job for the current day's data, the segments created by
the batch job will have a more recent version than most of the segments generated by realtime ingestion. If your batch job is indexing
data that isn't yet complete for the day, the daily segment created by the batch job can override recent segments created by
realtime nodes. A portion of data will appear to be lost in this case.
### Schema changes
Standalone realtime nodes require stopping a node to update a schema, and starting it up again for the schema to take effect.
This can be difficult to manage at scale, especially with multiple partitions.
### Log management
Each standalone realtime node has its own set of logs. Diagnosing errors across many partitions across many servers may be
difficult to manage and track at scale.
---
layout: doc_page
---
## Stream Push
Druid can connect to any streaming data source through
[Tranquility](https://github.com/druid-io/tranquility/blob/master/README.md), a package for pushing
streams to Druid in real-time. Druid does not come bundled with Tranquility, and you will have to download the distribution.
```note-info
If you've never loaded streaming data into Druid, we recommend trying out the
[stream loading tutorial](../tutorials/tutorial-streams.html) first and then coming back to this page.
```
Note that with all streaming ingestion options, you must ensure that incoming data is recent
enough (within a [configurable windowPeriod](#segmentgranularity-and-windowperiod) of the current
time). Older messages will not be processed in real-time. Historical data is best processed with
[batch ingestion](../ingestion/batch-ingestion.html).
### Server
Druid can use [Tranquility Server](https://github.com/druid-io/tranquility/blob/master/docs/server.md), which
lets you send data to Druid without developing a JVM app. You can run Tranquility server colocated with Druid middleManagers
and historical processes.
Tranquility server is started by issuing:
```bash
bin/tranquility server -configFile <path_to_config_file>/server.json
```
To customize Tranquility Server:
- In `server.json`, customize the `properties` and `dataSources`.
- If you have servers already running Tranquility, stop them (CTRL-C) and start
them up again.
For tips on customizing `server.json`, see the
*[Loading your own streams](../tutorials/tutorial-streams.html)* tutorial and the
[Tranquility Server documentation](https://github.com/druid-io/tranquility/blob/master/docs/server.md).
### Kafka
[Tranquility Kafka](https://github.com/druid-io/tranquility/blob/master/docs/kafka.md)
lets you load data from Kafka into Druid without writing any code. You only need a configuration
file.
Tranquility Kafka is started by issuing:
```bash
bin/tranquility kafka -configFile <path_to_config_file>/kafka.json
```
To customize Tranquility Kafka in the single-machine quickstart configuration:
- In `kafka.json`, customize the `properties` and `dataSources`.
- If you have Tranquility already running, stop it (CTRL-C) and start it up again.
For tips on customizing `kafka.json`, see the
[Tranquility Kafka documentation](https://github.com/druid-io/tranquility/blob/master/docs/kafka.md).
### JVM apps and stream processors
Tranquility can also be embedded in JVM-based applications as a library. You can do this directly
in your own program using the
[Core API](https://github.com/druid-io/tranquility/blob/master/docs/core.md), or you can use
the connectors bundled in Tranquility for popular JVM-based stream processors such as
[Storm](https://github.com/druid-io/tranquility/blob/master/docs/storm.md),
[Samza](https://github.com/druid-io/tranquility/blob/master/docs/samza.md),
[Spark Streaming](https://github.com/druid-io/tranquility/blob/master/docs/spark.md), and
[Flink](https://github.com/druid-io/tranquility/blob/master/docs/flink.md).
## Concepts
### Task creation
Tranquility automates creation of Druid realtime indexing tasks, handling partitioning, replication,
service discovery, and schema rollover for you, seamlessly and without downtime. You never have to
write code to deal with individual tasks directly. But, it can be helpful to understand how
Tranquility creates tasks.
Tranquility spawns relatively short-lived tasks periodically, and each one handles a small number of
[Druid segments](../design/segments.html). Tranquility coordinates all task
creation through ZooKeeper. You can start up as many Tranquility instances as you like with the same
configuration, even on different machines, and they will send to the same set of tasks.
See the [Tranquility overview](https://github.com/druid-io/tranquility/blob/master/docs/overview.md)
for more details about how Tranquility manages tasks.
### segmentGranularity and windowPeriod
The segmentGranularity is the time period covered by the segments produced by each task. For
example, a segmentGranularity of "hour" will spawn tasks that create segments covering one hour
each.
The windowPeriod is the slack time permitted for events. For example, a windowPeriod of ten minutes
(the default) means that any events with a timestamp older than ten minutes in the past, or more
than ten minutes in the future, will be dropped.
These are important configurations because they influence how long tasks will be alive for, and how
long data stays in the realtime system before being handed off to the historical nodes. For example,
if your configuration has segmentGranularity "hour" and windowPeriod ten minutes, tasks will stay
around listening for events for an hour and ten minutes. For this reason, to prevent excessive
buildup of tasks, it is recommended that your windowPeriod be less than your segmentGranularity.
### Append only
Druid streaming ingestion is *append-only*, meaning you cannot use streaming ingestion to update or
delete individual records after they are inserted. If you need to update or delete individual
records, you need to use a batch reindexing process. See the *[batch ingest](batch-ingestion.html)*
page for more details.
Druid does support efficient deletion of entire time ranges without resorting to batch reindexing.
This can be done automatically through setting up retention policies.
### Guarantees
Tranquility operates under a best-effort design. It tries reasonably hard to preserve your data, by allowing you to set
up replicas and by retrying failed pushes for a period of time, but it does not guarantee that your events will be
processed exactly once. In some conditions, it can drop or duplicate events:
- Events with timestamps outside your configured windowPeriod will be dropped.
- If you suffer more Druid Middle Manager failures than your configured replicas count, some
partially indexed data may be lost.
- If there is a persistent issue that prevents communication with the Druid indexing service, and
retry policies are exhausted during that period, or the period lasts longer than your windowPeriod,
some events will be dropped.
- If there is an issue that prevents Tranquility from receiving an acknowledgement from the indexing
service, it will retry the batch, which can lead to duplicated events.
- If you are using Tranquility inside Storm or Samza, various parts of both architectures have an
at-least-once design and can lead to duplicated events.
Under normal operation, these risks are minimal. But if you need absolute 100% fidelity for
historical data, we recommend a [hybrid batch/streaming](../tutorials/ingestion.html#hybrid-batch-streaming)
architecture.
......@@ -9,6 +9,10 @@ There are several different types of tasks.
Segment Creation Tasks
----------------------
### Hadoop Index Task
See [batch ingestion](../ingestion/batch-ingestion.html).
### Index Task
The Index Task is a simpler variation of the Index Hadoop task that is designed to be used for smaller data sets. The task executes within the indexing service and does not require an external Hadoop setup to use. The grammar of the index task is as follows:
......@@ -123,155 +127,6 @@ The indexSpec is optional and default parameters will be used if not specified.
|dimensionCompression|compression format for dimension columns (currently only affects single-value dimensions, multi-value dimensions are always uncompressed)|`"uncompressed"`, `"lz4"`, `"lzf"`|`"lz4"`|no|
|metricCompression|compression format for metric columns, defaults to LZ4|`"lz4"`, `"lzf"`|`"lz4"`|no|
### Hadoop Index Task
The Hadoop Index Task is used to index larger data sets that require the parallelization and processing power of a Hadoop cluster.
```
{
"type" : "index_hadoop",
"spec": <Hadoop index spec>
}
```
|property|description|required?|
|--------|-----------|---------|
|type|The task type, this should always be "index_hadoop".|yes|
|spec|A Hadoop Index Spec. See [Batch Ingestion](../ingestion/batch-ingestion.html)|yes|
|hadoopDependencyCoordinates|A JSON array of Hadoop dependency coordinates that Druid will use, this property will override the default Hadoop coordinates. Once specified, Druid will look for those Hadoop dependencies from the location specified by `druid.extensions.hadoopDependenciesDir`|no|
|classpathPrefix|Classpath that will be prepended for the peon process.|no|
The Hadoop Index Config submitted as part of an Hadoop Index Task is identical to the Hadoop Index Config used by the `HadoopDruidIndexer` except that three fields must be omitted: `segmentOutputPath`, `workingPath`, `metadataUpdateSpec`. The Indexing Service takes care of setting these fields internally.
Note: Before using Hadoop Index Task, please make sure to include Hadoop dependencies so that Druid knows where to pick them up during runtime, see [Include Hadoop Dependencies](../operations/other-hadoop.html).
Druid uses hadoop-client 2.3.0 as the default Hadoop version, you can get it from the released Druid tarball(under folder ```hadoop_dependencies```) or use [pull-deps](../pull-deps.html).
#### Using your own Hadoop distribution
Druid is compiled against Apache hadoop-client 2.3.0. However, if you happen to use a different flavor of Hadoop that is API compatible with hadoop-client 2.3.0, you should first make sure Druid knows where to pick it up, then you should only have to change the `hadoopDependencyCoordinates` property to point to the list of maven artifact used by your distribution. For non-API compatible versions and more information, please see [here](../operations/other-hadoop.html).
#### Resolving dependency conflicts running HadoopIndexTask
Currently, the HadoopIndexTask creates a single classpath to run the HadoopDruidIndexerJob, which can lead to version conflicts between various dependencies of Druid, extension modules, and Hadoop's own dependencies.
The Hadoop index task will put Druid's dependencies first on the classpath, followed by any extensions dependencies, and any Hadoop dependencies last.
If you are having trouble with any extensions in HadoopIndexTask, it may be the case that Druid, or one of its dependencies, depends on a different version of a library than what you are using as part of your extensions, but Druid's version overrides the one in your extension. In that case you probably want to build your own Druid version and override the offending library by adding an explicit dependency to the pom.xml of each druid sub-module that depends on it.
### Realtime Index Task
The indexing service can also run real-time tasks. These tasks effectively transform a middle manager into a real-time node. We introduced real-time tasks as a way to programmatically add new real-time data sources without needing to manually add nodes. We recommend you use the library [tranquility](https://github.com/druid-io/tranquility) to programmatically manage generating real-time index tasks. The grammar for the real-time task is as follows:
```json
{
"type": "index_realtime",
"id": "example",
"resource": {
"availabilityGroup": "someGroup",
"requiredCapacity": 1
},
"spec": {
"dataSchema": {
"dataSource": "wikipedia",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "timestamp",
"format": "iso"
},
"dimensionsSpec": {
"dimensions": [
"page",
"language",
"user",
"unpatrolled",
"newPage",
"robot",
"anonymous",
"namespace",
"continent",
"country",
"region",
"city"
],
"dimensionExclusions": [
],
"spatialDimensions": [
]
}
}
},
"metricsSpec": [
{
"type": "count",
"name": "count"
},
{
"type": "doubleSum",
"name": "added",
"fieldName": "added"
},
{
"type": "doubleSum",
"name": "deleted",
"fieldName": "deleted"
},
{
"type": "doubleSum",
"name": "delta",
"fieldName": "delta"
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "DAY",
"queryGranularity": "NONE"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "zk_connect_string",
"zookeeper.connection.timeout.ms" : "15000",
"zookeeper.session.timeout.ms" : "15000",
"zookeeper.sync.time.ms" : "5000",
"group.id": "consumer-group",
"fetch.message.max.bytes" : "1048586",
"auto.offset.reset": "largest",
"auto.commit.enable": "false"
},
"feed": "your_kafka_topic"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 500000,
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT10m",
"rejectionPolicy": {
"type": "serverTime"
}
}
}
}
```
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|id|String|The ID of the task.|No|
|Resource|JSON object|Used for high availability purposes.|No|
|availabilityGroup|String|An uniqueness identifier for the task. Tasks with the same availability group will always run on different middle managers. Used mainly for replication. |yes|
|requiredCapacity|Integer|How much middle manager capacity this task will take.|yes|
For schema, windowPeriod, segmentGranularity, and other configuration information, see [Realtime Ingestion](../ingestion/realtime-ingestion.html). For firehose configuration, see [Firehose](../ingestion/firehose.html).
Segment Merging Tasks
---------------------
......@@ -328,7 +183,8 @@ The convert task suite takes active segments and will recompress them using a ne
Upon success the new segments will have the same version as the old segment with `_converted` appended. A convert task may be run against the same interval for the same datasource multiple times. Each execution will append another `_converted` to the version for the segments
There are two types of conversion tasks. One is the Hadoop convert task, and the other is the indexing service convert task. The Hadoop convert task runs on a hadoop cluster, and simply leaves a task monitor on the indexing service (similar to the hadoop batch task). The indexing service convert task runs the actual conversion on the indexing service.
####Hadoop Convert Segment Task
#### Hadoop Convert Segment Task
```json
{
"type": "hadoop_convert_segment",
......@@ -358,7 +214,7 @@ The values are described below.
|`segmentOutputPath`|URI|A base uri for the segment to be placed. Same format as other places a segment output path is needed|Yes|
####Indexing Service Convert Segment Task
#### Indexing Service Convert Segment Task
```json
{
"type": "convert_segment",
......@@ -380,6 +236,7 @@ The values are described below.
|`validate`|boolean|Runs validation between the old and new segment before reporting task success|No (true)|
Unlike the hadoop convert task, the indexing service task draws its output path from the indexing service's configuration.
### Noop Task
These tasks start, sleep for a time and are used only for testing. The available grammar is:
......@@ -396,4 +253,8 @@ These tasks start, sleep for a time and are used only for testing. The available
Locking
-------
Once an overlord node accepts a task, a lock is created for the data source and interval specified in the task. Tasks do not need to explicitly release locks, they are released upon task completion. Tasks may potentially release locks early if they desire. Tasks ids are unique by naming them using UUIDs or the timestamp in which the task was created. Tasks are also part of a "task group", which is a set of tasks that can share interval locks.
Once an overlord node accepts a task, a lock is created for the data source and interval specified in the task.
Tasks do not need to explicitly release locks, they are released upon task completion. Tasks may potentially release
locks early if they desire. Task IDs are made unique by naming them using UUIDs or the timestamp at which the task was created.
Tasks are also part of a "task group", which is a set of tasks that can share interval locks.
......@@ -109,7 +109,7 @@ For example:
### Reindexing without Hadoop Batch Ingestion
This section assumes the reader understands how to do batch ingestion without Hadoop using the [IndexTask](../misc/tasks.html#index-task),
This section assumes the reader understands how to do batch ingestion without Hadoop using the [IndexTask](../ingestion/tasks.html#index-task),
which uses a "firehose" to know where and how to read the input data. [IngestSegmentFirehose](firehose.html#ingestsegmentfirehose)
can be used to read data from segments inside Druid. Note that IndexTask is to be used for prototyping purposes only as
it has to do all processing inside a single process and can't scale. Please use Hadoop batch ingestion for production
......
---
layout: doc_page
---
# Setting Up a Druid Cluster
A Druid cluster consists of various node types that need to be set up depending on your use case. See our [Design](../design/design.html) docs for a description of the different node types.
Minimum Physical Layout: Absolute Minimum
-----------------------------------------
As a special case, the absolute minimum setup is one of the standalone examples for real-time ingestion and querying; see [Examples](../tutorials/examples.html) that can easily run on one machine with one core and 1GB RAM. This layout can be set up to try some basic queries with Druid.
Minimum Physical Layout: Experimental Testing with 4GB of RAM
-------------------------------------------------------------
This layout can be used to load some data from deep storage onto a Druid historical node for the first time. A minimal physical layout for a 1 or 2 core machine with 4GB of RAM is:
1. node1: [Coordinator](../design/coordinator.html) + metadata service + zookeeper + [Historical](../design/historical.html)
2. transient nodes: [Indexing Service](../design/indexing-service.html)
This setup is only reasonable to prove that a configuration works. It would not be worthwhile to use this layout for performance measurement.
Comfortable Physical Layout: Pilot Project with Multiple Machines
-----------------------------------------------------------------
The machine size "flavors" use AWS/EC2 terminology for descriptive purposes only and are not meant to imply that AWS/EC2 is required or recommended. Another cloud provider or your own hardware can also work.
A minimal physical layout not constrained by cores that demonstrates parallel querying and realtime, using AWS-EC2 "small"/m1.small (one core, with 1.7GB of RAM) or larger, no real-time, is:
1. node1: [Coordinator](../design/coordinator.html) (m1.small)
2. node2: metadata service (m1.small)
3. node3: zookeeper (m1.small)
4. node4: [Broker](../design/broker.html) (m1.small or m1.medium or m1.large)
5. node5: [Historical](../design/historical.html) (m1.small or m1.medium or m1.large)
6. node6: [Historical](../design/historical.html) (m1.small or m1.medium or m1.large)
7. node7: [Realtime](../design/realtime.html) (m1.small or m1.medium or m1.large)
8. transient nodes: [Indexing Service](../design/indexing-service.html)
This layout naturally lends itself to adding more RAM and core to Historical nodes, and to adding many more Historical nodes. Depending on the actual load, the Coordinator, metadata server, and Zookeeper might need to use larger machines.
High Availability Physical Layout
---------------------------------
The machine size "flavors" use AWS/EC2 terminology for descriptive purposes only and are not meant to imply that AWS/EC2 is required or recommended. Another cloud provider or your own hardware can also work.
An HA layout allows full rolling restarts and heavy volume:
1. node1: [Coordinator](../design/coordinator.html) (m1.small or m1.medium or m1.large)
2. node2: [Coordinator](../design/coordinator.html) (m1.small or m1.medium or m1.large) (backup)
3. node3: metadata service (c1.medium or m1.large)
4. node4: metadata service (c1.medium or m1.large) (backup)
5. node5: zookeeper (c1.medium)
6. node6: zookeeper (c1.medium)
7. node7: zookeeper (c1.medium)
8. node8: [Broker](../design/broker.html) (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
9. node9: [Broker](../design/broker.html) (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) (backup)
10. node10: [Historical](../design/historical.html) (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
11. node11: [Historical](../design/historical.html) (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
12. node12: [Realtime](../design/realtime.html) (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
13. transient nodes: [Indexing Service](../design/indexing-service.html)
Sizing for Cores and RAM
------------------------
The Historical and Broker nodes will use as many cores as are available, depending on usage, so it is best to keep these on dedicated machines. The upper limit of effectively utilized cores is not well characterized yet and would depend on types of queries, query load, and the schema. Historical daemons should have a heap size of at least 1GB per core for normal usage, but could be squeezed into a smaller heap for testing. Since in-memory caching is essential for good performance, even more RAM is better. Broker nodes will use RAM for caching, so they do more than just route queries.
The effective utilization of cores by Zookeeper, metadata storage, and Coordinator nodes is likely to be between 1 and 2 for each process/daemon, so these could potentially share a machine with lots of cores. These daemons work with a heap size between 500MB and 1GB.
Storage
-------
Indexed segments should be kept in a permanent store accessible by all nodes like AWS S3 or HDFS or equivalent. Refer to [Deep-Storage](../dependencies/deep-storage.html) for more details on supported storage types.
Local disk ("ephemeral" on AWS EC2) for caching is recommended over network mounted storage (example of mounted: AWS EBS, Elastic Block Store) in order to avoid network delays during times of heavy usage. If your data center is suitably provisioned for networked storage, perhaps with separate LAN/NICs just for storage, then mounted might work fine.
Setup
-----
Setting up a cluster is essentially just firing up all of the nodes you want with the proper [configuration](../configuration/index.html). One thing to be aware of is that there are a few properties in the configuration that potentially need to be set individually for each process:
```
druid.server.type=historical|realtime
druid.host=someHostOrIPaddrWithPort
druid.port=8080
```
`druid.server.type` should be set to "historical" for your historical nodes and realtime for the realtime nodes. The Coordinator will only assign segments to a "historical" node and the broker has some intelligence around its ability to cache results when talking to a realtime node. This does not need to be set for the coordinator or the broker.
`druid.host` should be set to the hostname that can be used to talk to the given server process. Basically, someone should be able to send a request to http://${druid.host}:${druid.port}/ and actually talk to the process.
`druid.port` should be set to the port that the server should listen on.
Build/Run
---------
The simplest way to build and run from the repository is to run `mvn package` from the base directory and then take `druid-services/target/druid-services-*-selfcontained.jar` and push that around to your machines; the jar does not need to be expanded, and since it contains the main() methods for each kind of service, it is *not* invoked with java -jar. It can be run from a normal java command-line by just including it on the classpath and then giving it the main class that you want to run. For example one instance of the Historical node/service can be started like this:
```
java -Duser.timezone=UTC -Dfile.encoding=UTF-8 -cp services/target/druid-services-*-selfcontained.jar io.druid.cli.Main server historical
```
All Druid server nodes can be started with:
```
io.druid.cli.Main server <node_type>
```
The table below shows the program arguments for the different node types.
|service|program arguments|
|-------|----------------|
|Realtime|realtime|
|Coordinator|coordinator|
|Broker|broker|
|Historical|historical|
---
layout: doc_page
---
Evaluate Druid
==============
This page is meant to help you in evaluating Druid by answering common questions that come up.
## Evaluating on a Single Machine
Most of the tutorials focus on running multiple Druid services on a single machine in an attempt to teach basic Druid concepts, and work out kinks in data ingestion. The configurations in the tutorials are
very poor choices for an actual production cluster.
## Capacity and Cost Planning
The best way to understand what your cluster will cost is to first understand how much data reduction you will get when you create segments.
We recommend indexing and creating segments from 1G of your data and evaluating the resultant segment size. This will allow you to see how much your data rolls up, and how many segments will be able
to be loaded on the hardware you have at your disposal.
Most of the cost of a Druid cluster is in historical nodes, followed by real-time indexing nodes if you have a high data intake. For high availability, you should have backup
coordination nodes (coordinators and overlords). Coordination nodes should require much cheaper hardware than nodes that serve queries.
## Selecting Hardware
Druid is designed to run on commodity hardware and we've tried to provide some general guidelines on [how things should be tuned]() for various deployments. We've also provided
some [example specs](../configuration/production-cluster.html) for hardware for a production cluster.
## Benchmarking Druid
The best resource to benchmark Druid is to follow the steps outlined in our [blog post](http://druid.io/blog/2014/03/17/benchmarking-druid.html) about the topic.
The code to reproduce the results in the blog post is all open source. The blog post covers Druid queries on TPC-H data, but you should be able to customize
configuration parameters to your data set. The blog post is a little outdated and uses an older version of Druid, but is still mostly relevant to demonstrate performance.
## Colocating Druid Processes for a POC
Not all Druid node processes need to run on separate machines. You can set up a small cluster with colocated processes to load several gigabytes of data. Please note this cluster is not highly available.
It is recommended you follow the [example production configuration](../configuration/production-cluster.html) for an actual production setup.
The deep storage to use in this POC example can be S3 or HDFS.
* node1: [Coordinator](../design/coordinator.html) + metadata store + zookeeper.
Example hardware: EC2 c3.2xlarge node (8 cores, Intel Xeon E5-2680 v2 @ 2.80GHz and 15GB of RAM).
See [here](../configuration/production-cluster.html) for the runtime.properties. Some example JVM configs for this hardware:
```
-server
-Xmx6g
-Xms6g
-XX:NewSize=512m
-XX:MaxNewSize=512m
-XX:+UseConcMarkSweepGC
```
* node2: [Broker](../design/broker.html)
Example hardware: EC2 c3.2xlarge node (8 cores, Intel Xeon E5-2680 v2 @ 2.80GHz and 15GB of RAM).
[Example configs](https://github.com/druid-io/druid-benchmark/tree/master/config) (see broker-* files).
* node3: [Historical](../design/historical.html).
Example hardware: EC2 m3.2xlarge instances (8 cores, Intel Xeon E5-2670 v2 @ 2.50GHz with 160GB SSD and 30GB of RAM)
[Example configs](https://github.com/druid-io/druid-benchmark/tree/master/config) (see compute-* files).
* node4 (optional): [Real-time](../design/realtime.html) node or [Overlord (Indexing Service)](../design/indexing-service.html) (depending on how you choose to ingest data).
Example hardware: EC2 c3.2xlarge node (8 cores, Intel Xeon E5-2680 v2 @ 2.80GHz and 15GB of RAM).
For the real-time node, see [here](../configuration/production-cluster.html) for the runtime.properties. Use with the following JVM configs:
```
-server
-Xmx8g
-Xms8g
-XX:NewSize=1g
-XX:MaxNewSize=1g
-XX:+UseConcMarkSweepGC
```
For small ingest workloads, you can run the overlord in local mode to load your data.
To do so, see [here](../configuration/simple-cluster.md#overlord-node-indexing-service) for the runtime.properties. Use with the following JVM configs:
```
-server
-Xmx2g
-Xms2g
-XX:NewSize=256m
-XX:MaxNewSize=256m
```
The size of the runner javaOpts can be bumped up:
```
druid.indexer.runner.javaOpts="-server -Xmx6g -Xms6g -XX:NewSize=256m -XX:MaxNewSize=256m"
```
The coordination pieces (coordinator, metadata store, ZK) can be colocated on the same node. These processes do not require many resources, even for reasonably large clusters.
You can add more historical nodes if your data doesn't fit on a single machine.
......@@ -19,12 +19,12 @@ To make this work, follow the steps below
Example:
Suppose you specify `druid.extensions.hadoopDependenciesDir=/usr/local/druid/hadoop_dependencies`, and you want to prepare both `hadoop-client` 2.3.0 and 2.4.0 for Druid,
Suppose you specify `druid.extensions.hadoopDependenciesDir=/usr/local/druid/hadoop-dependencies`, and you want to prepare both `hadoop-client` 2.3.0 and 2.4.0 for Druid,
Then you can either use [pull-deps](../pull-deps.html) or manually set up Hadoop dependencies directories such that under ```hadoop_dependencies```, it looks like this,
Then you can either use [pull-deps](../pull-deps.html) or manually set up Hadoop dependencies directories such that under ```hadoop-dependencies```, it looks like this,
```
hadoop_dependencies/
hadoop-dependencies/
└── hadoop-client
├── 2.3.0
│   ├── activation-1.1.jar
......@@ -44,7 +44,7 @@ hadoop_dependencies/
..... lots of jars
```
As you can see, under ```hadoop-client```, there are two sub-directories, each denotes a version of ```hadoop-client```. During runtime, Druid will look for these directories and load appropriate ```hadoop-client``` based on `hadoopDependencyCoordinates` passed to [Hadoop Index Task](../misc/tasks.html).
As you can see, under ```hadoop-client```, there are two sub-directories, each denotes a version of ```hadoop-client```. During runtime, Druid will look for these directories and load appropriate ```hadoop-client``` based on `hadoopDependencyCoordinates` passed to [Hadoop Index Task](../ingestion/tasks.html).
### Append your Hadoop jars to the Druid classpath
......@@ -58,7 +58,7 @@ If you really don't like the way above, and you just want to use one specific Ha
The default version of Hadoop bundled with Druid is 2.3.
To override the default Hadoop version, both the Hadoop Index Task and the standalone Hadoop indexer support the parameter `hadoopDependencyCoordinates`(See [Index Hadoop Task](../misc/tasks.html). You can pass another set of Hadoop coordinates through this parameter (e.g. You can specify coordinates for Hadoop 2.4.0 as `["org.apache.hadoop:hadoop-client:2.4.0"]`), which will overwrite the default Hadoop coordinates Druid uses.
To override the default Hadoop version, both the Hadoop Index Task and the standalone Hadoop indexer support the parameter `hadoopDependencyCoordinates` (see [Index Hadoop Task](../ingestion/tasks.html)). You can pass another set of Hadoop coordinates through this parameter (e.g. You can specify coordinates for Hadoop 2.4.0 as `["org.apache.hadoop:hadoop-client:2.4.0"]`), which will overwrite the default Hadoop coordinates Druid uses.
The Hadoop Index Task takes this parameter as part of the task JSON and the standalone Hadoop indexer takes this parameter as a command line argument.
......
......@@ -2,10 +2,14 @@
layout: doc_page
---
# Performance FAQ
## I can't match your benchmarked results
Improper configuration is by far the largest problem we see when people try to deploy Druid. The example configurations listed in the tutorials are designed for a small volume of data where all nodes are on a single machine. The configs are extremely poor for actual production use.
## What should I set my JVM heap?
The size of the JVM heap really depends on the type of Druid node you are running. Below are a few considerations.
[Broker nodes](../design/broker.html) use the JVM heap mainly to merge results from historicals and real-times. Brokers also use off-heap memory and processing threads for groupBy queries. We recommend 20G-30G of heap here.
......
......@@ -68,8 +68,8 @@ extensions
```
```
tree hadoop_dependencies
hadoop_dependencies/
tree hadoop-dependencies
hadoop-dependencies/
└── hadoop-client
├── 2.3.0
│   ├── activation-1.1.jar
......
......@@ -13,10 +13,6 @@ We recommend using UTC timezone for all your events and across on your nodes, no
SSDs are highly recommended for historical and real-time nodes if you are not running a cluster that is entirely in memory. SSDs can greatly mitigate the time required to page data in and out of memory.
# Provide Columns Names in Lexicographic Order
Although Druid supports schema-less ingestion of dimensions, because of [https://github.com/druid-io/druid/issues/658](https://github.com/druid-io/druid/issues/658), you may sometimes get bigger segments than necessary. To ensure segments are as compact as possible, providing dimension names in lexicographic order is recommended.
# Use Timeseries and TopN Queries Instead of GroupBy Where Possible
Timeseries and TopN queries are much more optimized and significantly faster than groupBy queries for their designed use cases. Issuing multiple topN or timeseries queries from your application can potentially be more efficient than a single groupBy query.
......
......@@ -129,4 +129,4 @@ The interval of a segment will be compared against the specified period. The per
# Permanently Deleting Data
Druid can fully drop data from the cluster, wipe the metadata store entry, and remove the data from deep storage for any segments that are
marked as unused (segments dropped from the cluster via rules are always marked as unused). You can submit a [kill task](../misc/tasks.html) to the [indexing service](../design/indexing-service.html) to do this.
marked as unused (segments dropped from the cluster via rules are always marked as unused). You can submit a [kill task](../ingestion/tasks.html) to the [indexing service](../design/indexing-service.html) to do this.
......@@ -82,6 +82,9 @@ Computes an arbitrary JavaScript function over a set of columns (both metrics an
All JavaScript functions must return numerical values.
JavaScript aggregators are much slower than native Java aggregators and if performance is critical, you should implement
your functionality as a native Java aggregator.
```json
{ "type": "javascript",
"name": "<output_name>",
......
---
layout: doc_page
---
# Query Caching
Druid supports query result caching through an LRU cache. Results are stored on a per segment basis, along with the
parameters of a given query. This allows Druid to return final results based partially on segment results in the cache and partially
on segment results from scanning historical/real-time segments.
Segment results can be stored in a local heap cache or in an external distributed key/value store. Segment query caches
can be enabled at either the Historical or the Broker level (it is not recommended to enable caching on both).
## Query caching on Brokers
Enabling caching on the broker can yield faster results than if query caches were enabled on Historicals for small clusters. This is
the recommended setup for smaller production clusters (< 20 servers). Take note that when caching is enabled on the Broker,
results from Historicals are returned on a per segment basis, and Historicals will not be able to do any local result merging.
## Query caching on Historicals
Larger production clusters should enable caching only on the Historicals to avoid having to use Brokers to merge all query
results. Enabling caching on the Historicals enables the Historicals to do their own local result merging, and puts less strain
on the Brokers.
......@@ -2,6 +2,10 @@
layout: doc_page
---
This document contains additional query optimizations for certain types of queries.
# Multi-value Dimensions
Druid supports "multi-valued" dimensions. See the section on multi-valued columns in [segments](../design/segments.html) for internal representation details. This document describes the behavior of groupBy (topN has similar behavior) queries on multi-valued dimensions when they are used as a dimension being grouped by.
Suppose, you have a dataSource with a segment that contains following rows with a multi-valued dimension called tags.
......
......@@ -5,18 +5,22 @@
h2. Getting Started
* "Concepts":../design/
* "Hello, Druid":../tutorials/tutorial-a-first-look-at-druid.html
* "Tutorials":../tutorials/index.html
* "Evaluate Druid":../misc/evaluate.html
* "Quickstart":../tutorials/quickstart.html
* "Loading Data":../tutorials/ingestion.html
** "Loading from Files":../tutorials/tutorial-batch.html
** "Loading from Streams":../tutorials/tutorial-streams.html
** "Loading from Kafka":../tutorials/tutorial-kafka.html
** "Clustering":../tutorials/cluster.html
h2. Data Ingestion
* "Overview":../ingestion/overview.html
* "Data Formats":../ingestion/data-formats.html
* "Data Schema":../ingestion/index.html
* "Schema Design":../ingestion/schema-design.html
* "Schema Changes":../ingestion/schema-changes.html
* "Realtime Ingestion":../ingestion/realtime-ingestion.html
* "Batch Ingestion":../ingestion/batch-ingestion.html
* "Batch File Ingestion":../ingestion/batch-ingestion.html
* "Stream Ingestion":../ingestion/stream-ingestion.html
** "Stream Push":../ingestion/stream-push.html
** "Stream Pull":../ingestion/stream-pull.html
* "Updating Existing Data":../ingestion/update-existing-data.html
* "FAQ":../ingestion/faq.html
......@@ -29,6 +33,7 @@ h2. Querying
* "Segment Metadata":../querying/segmentmetadataquery.html
* "DataSource Metadata":../querying/datasourcemetadataquery.html
* "Search":../querying/searchquery.html
* "Select":../querying/select-query.html
* Components
** "Datasources":../querying/datasource.html
** "Filters":../querying/filters.html
......@@ -39,7 +44,9 @@ h2. Querying
** "Context":../querying/query-context.html
* "SQL":../querying/sql.html
* "Joins":../querying/joins.html
* "Multi-Valued Dimensions":../querying/multi-valued-dimensions.html
* "Optimizations":../querying/optimizations.html
* "Multitenancy":../querying/multitenancy.html
* "Caching":../querying/caching.html
h2. Design
* "Overview":../design/design.html
......@@ -64,7 +71,6 @@ h2. Operations
* "Alerts":../operations/alerts.html
* "Updating the Cluster":../operations/rolling-updates.html
* "Different Hadoop Versions":../operations/other-hadoop.html
* "Multitenancy Considerations":../operations/multitenancy.html
* "Performance FAQ":../operations/performance-faq.html
h2. Configuration
......@@ -75,7 +81,6 @@ h2. Configuration
* "Broker":../configuration/broker.html
* "Realtime":../configuration/realtime.html
* "Configuring Logging":../configuration/logging.html
* "Simple Cluster Configuration":../configuration/simple-cluster.html
* "Production Cluster Configuration":../configuration/production-cluster.html
* "Production Hadoop Configuration":../configuration/hadoop.html
* "Production Zookeeper Configuration":../configuration/zookeeper.html
......@@ -90,11 +95,10 @@ h2. Development
* Experimental Features
** "Overview":../development/experimental.html
** "Geographic Queries":../development/geo.html
** "Select Query":../development/select-query.html
** "Approximate Histograms and Quantiles":../development/approximate-histograms.html
** "Datasketches based Aggregators":../development/datasketches-aggregators.html
** "Router node":../development/router.html
** "New Kafka Firehose":../development/kafka-simple-consumer-firehose.html
** "Datasketches":../development/datasketches-aggregators.html
** "Router":../development/router.html
** "Kafka Simple Consumer Firehose":../development/kafka-simple-consumer-firehose.html
h2. Misc
* "Papers & Talks":../misc/papers-and-talks.html
......
---
layout: doc_page
---
# Booting a Druid Cluster
[Loading Your Data](../tutorials/tutorial-loading-batch-data.html) and [All About Queries](../tutorials/tutorial-all-about-queries.html) contain recipes to boot a small druid cluster on localhost. However, when it's time to run a more realistic setup&mdash;for production or just for testing production&mdash;you'll want to find a way to start the cluster on multiple hosts. This document describes two different ways to do this: manually, or as a cloud service via Apache Whirr.
## Manually Booting a Druid Cluster
You can provision individual servers, loading Druid onto each machine (or building it) and setting the required configuration for each type of node. You'll also have to set up required external dependencies. Then you'll have to start each node. This process is outlined in [Tutorial: The Druid Cluster](../tutorials/tutorial-the-druid-cluster.html).
---
layout: doc_page
---
# Clustering
Druid is designed to be deployed as a scalable, fault-tolerant cluster.
In this document, we'll set up a simple cluster and discuss how it can be further configured to meet
your needs. This simple cluster will feature scalable, fault-tolerant servers for Historicals and MiddleManagers, and a single
coordination server to host the Coordinator and Overlord processes. In production, we recommend deploying Coordinators and Overlords in a fault-tolerant
configuration as well.
## Select hardware
The Coordinator and Overlord processes can be co-located on a single server that is responsible for handling the metadata and coordination needs of your cluster.
The equivalent of an AWS [m3.xlarge](https://aws.amazon.com/ec2/instance-types/#M3) is sufficient for most clusters. This
hardware offers:
- 4 vCPUs
- 15 GB RAM
- 80 GB SSD storage
Historicals and MiddleManagers can be colocated on a single server to handle the actual data in your cluster. These servers benefit greatly from CPU, RAM,
and SSDs. The equivalent of an AWS [r3.2xlarge](https://aws.amazon.com/ec2/instance-types/#r3) is a
good starting point. This hardware offers:
- 8 vCPUs
- 61 GB RAM
- 160 GB SSD storage
Druid Brokers accept queries and farm them out to the rest of the cluster. They also optionally maintain an
in-memory query cache. These servers benefit greatly from CPU and RAM, and can also be deployed on
the equivalent of an AWS [r3.2xlarge](https://aws.amazon.com/ec2/instance-types/#r3). This hardware
offers:
- 8 vCPUs
- 61 GB RAM
- 160 GB SSD storage
You can consider co-locating any open source UIs or query libraries on the same server that the Broker is running on.
Very large clusters should consider selecting larger servers.
## Select OS
We recommend running your favorite Linux distribution. You will also need:
* Java 7 or better
Your OS package manager should be able to help with Java. If your Ubuntu-based OS
does not have a recent enough version of Java, WebUpd8 offers [packages for those
OSes](http://www.webupd8.org/2012/09/install-oracle-java-8-in-ubuntu-via-ppa.html).
## Download the distribution
First, download and unpack the release archive. It's best to do this on a single machine at first,
since you will be editing the configurations and then copying the modified distribution out to all
of your servers.
```bash
curl -O http://static.druid.io/artifacts/releases/druid-0.9.0-bin.tar.gz
tar -xzf druid-0.9.0-bin.tar.gz
cd druid-0.9.0
```
In this package, you'll find:
* `LICENSE` - the license files.
* `bin/` - scripts related to the [single-machine quickstart](quickstart.md).
* `conf/*` - template configurations for a clustered setup.
* `conf-quickstart/*` - configurations for the [single-machine quickstart](quickstart.md).
* `extensions/*` - all Druid extensions.
* `hadoop-dependencies/*` - Druid Hadoop dependencies.
* `lib/*` - all included software packages for core Druid.
* `quickstart/*` - files related to the [single-machine quickstart](quickstart.md).
We'll be editing the files in `conf/` in order to get things running.
## Configure deep storage
Druid relies on a distributed filesystem or large object (blob) store for data storage. The most
commonly used deep storage implementations are S3 (popular for those on AWS) and HDFS (popular if
you already have a Hadoop deployment).
### S3
In `conf/druid/_common/common.runtime.properties`,
- Set `druid.extensions.loadList=["druid-s3-extensions"]`.
- Comment out the configurations for local storage under "Deep Storage" and "Indexing service logs".
- Uncomment and configure appropriate values in the "For S3" sections of "Deep Storage" and
"Indexing service logs".
After this, you should have made the following changes:
```
druid.extensions.loadList=["druid-s3-extensions"]
#druid.storage.type=local
#druid.storage.storageDirectory=var/druid/segments
druid.storage.type=s3
druid.storage.bucket=your-bucket
druid.storage.baseKey=druid/segments
druid.s3.accessKey=...
druid.s3.secretKey=...
#druid.indexer.logs.type=file
#druid.indexer.logs.directory=var/druid/indexing-logs
druid.indexer.logs.type=s3
druid.indexer.logs.s3Bucket=your-bucket
druid.indexer.logs.s3Prefix=druid/indexing-logs
```
### HDFS
In `conf/druid/_common/common.runtime.properties`,
- Set `druid.extensions.loadList=["druid-hdfs-storage"]`.
- Comment out the configurations for local storage under "Deep Storage" and "Indexing service logs".
- Uncomment and configure appropriate values in the "For HDFS" sections of "Deep Storage" and
"Indexing service logs".
After this, you should have made the following changes:
```
druid.extensions.loadList=["druid-hdfs-storage"]
#druid.storage.type=local
#druid.storage.storageDirectory=var/druid/segments
druid.storage.type=hdfs
druid.storage.storageDirectory=/druid/segments
#druid.indexer.logs.type=file
#druid.indexer.logs.directory=var/druid/indexing-logs
druid.indexer.logs.type=hdfs
druid.indexer.logs.directory=/druid/indexing-logs
```
Also,
- Place your Hadoop configuration XMLs (core-site.xml, hdfs-site.xml, yarn-site.xml,
mapred-site.xml) on the classpath of your Druid nodes. You can do this by copying them into
`conf/druid/_common/`.
## Configure Tranquility Server (optional)
Data streams can be sent to Druid through a simple HTTP API powered by Tranquility
Server. If you will be using this functionality, then at this point you should [configure
Tranquility Server](../ingestion/stream-ingestion.html#server).
## Configure Tranquility Kafka (optional)
Druid can consume streams from Kafka through Tranquility Kafka. If you will be
using this functionality, then at this point you should
[configure Tranquility Kafka](../ingestion/stream-ingestion.html#kafka).
## Configure for connecting to Hadoop (optional)
If you will be loading data from a Hadoop cluster, then at this point you should configure Druid to be aware
of your cluster:
- Update `druid.indexer.task.hadoopWorkingPath` in `conf/druid/middleManager/runtime.properties` to
a path on HDFS that you'd like to use for temporary files required during the indexing process.
`druid.indexer.task.hadoopWorkingPath=/tmp/druid-indexing` is a common choice.
- Place your Hadoop configuration XMLs (core-site.xml, hdfs-site.xml, yarn-site.xml,
mapred-site.xml) on the classpath of your Druid nodes. You can do this by copying them into
`conf/druid/_common/core-site.xml`, `conf/druid/_common/hdfs-site.xml`, and so on.
Note that you don't need to use HDFS deep storage in order to load data from Hadoop. For example, if
your cluster is running on Amazon Web Services, we recommend using S3 for deep storage even if you
are loading data using Hadoop or Elastic MapReduce.
For more info, please see [batch ingestion](../ingestion/batch-ingestion.html).
## Configure addresses for Druid coordination
In this simple cluster, you will deploy a single Druid Coordinator, a
single Druid Overlord, a single ZooKeeper instance, and an embedded Derby metadata store on the same server.
In `conf/druid/_common/common.runtime.properties`, replace
"zk.host.ip" with the IP address of the machine that runs your ZK instance:
- `druid.zk.service.host`
In `conf/druid/_common/common.runtime.properties`, replace
"metadata.store.ip" with the IP address of the machine that you will use as your metadata store:
- `druid.metadata.storage.connector.connectURI`
- `druid.metadata.storage.connector.host`
```note-caution
In production, we recommend running 2 servers, each running a Druid Coordinator
and a Druid Overlord. We also recommend running a ZooKeeper cluster on its own dedicated hardware,
as well as replicated [metadata
storage](http://druid.io/docs/latest/dependencies/metadata-storage.html) such as MySQL or
PostgreSQL, on its own dedicated hardware.
```
## Tune Druid processes that serve queries
Druid Historicals and MiddleManagers can be co-located on the same hardware. Both Druid processes benefit greatly from
being tuned to the hardware they run on. If you are running Tranquility Server or Kafka, you can also colocate Tranquility with these two Druid processes.
If you are using [r3.2xlarge](https://aws.amazon.com/ec2/instance-types/#r3)
EC2 instances, or similar hardware, the configuration in the distribution is a
reasonable starting point.
If you are using different hardware, we recommend adjusting configurations for your specific
hardware. The most commonly adjusted configurations are:
- `-Xmx` and `-Xms`
- `druid.server.http.numThreads`
- `druid.processing.buffer.sizeBytes`
- `druid.processing.numThreads`
- `druid.query.groupBy.maxIntermediateRows`
- `druid.query.groupBy.maxResults`
- `druid.server.maxSize` and `druid.segmentCache.locations` on Historical Nodes
- `druid.worker.capacity` on MiddleManagers
```note
Keep -XX:MaxDirectMemorySize >= numThreads*sizeBytes, otherwise Druid will fail to start up.
```
Please see the Druid [configuration documentation](../configuration/index.html) for a full description of all
possible configuration options.
## Tune Druid Brokers
Druid Brokers also benefit greatly from being tuned to the hardware they
run on. If you are using [r3.2xlarge](https://aws.amazon.com/ec2/instance-types/#r3) EC2 instances,
or similar hardware, the configuration in the distribution is a reasonable starting point.
If you are using different hardware, we recommend adjusting configurations for your specific
hardware. The most commonly adjusted configurations are:
- `-Xmx` and `-Xms`
- `druid.server.http.numThreads`
- `druid.cache.sizeInBytes`
- `druid.processing.buffer.sizeBytes`
- `druid.processing.numThreads`
- `druid.query.groupBy.maxIntermediateRows`
- `druid.query.groupBy.maxResults`
```note-caution
Keep -XX:MaxDirectMemorySize >= numThreads*sizeBytes, otherwise Druid will fail to start up.
```
Please see the Druid [configuration documentation](../configuration/index.html) for a full description of all
possible configuration options.
## Start Coordinator, Overlord, Zookeeper, and metadata store
Copy the Druid distribution and your edited configurations to your coordination
server. If you have been editing the configurations on your local machine, you can use *rsync* to
copy them:
```bash
rsync -az druid-0.9.0/ COORDINATION_SERVER:druid-0.9.0/
```
Log on to your coordination server and install Zookeeper:
```bash
curl http://www.gtlib.gatech.edu/pub/apache/zookeeper/zookeeper-3.4.6/zookeeper-3.4.6.tar.gz -o zookeeper-3.4.6.tar.gz
tar -xzf zookeeper-3.4.6.tar.gz
cd zookeeper-3.4.6
cp conf/zoo_sample.cfg conf/zoo.cfg
./bin/zkServer.sh start
```
```note-caution
In production, we also recommend running a ZooKeeper cluster on its own dedicated hardware.
```
On your coordination server, *cd* into the distribution and start up the coordination services (you should do this in different windows or pipe the log to a file):
```bash
java `cat conf/druid/coordinator/jvm.config | xargs` -cp conf/druid/_common:conf/druid/coordinator:lib/* io.druid.cli.Main server coordinator
java `cat conf/druid/overlord/jvm.config | xargs` -cp conf/druid/_common:conf/druid/overlord:lib/* io.druid.cli.Main server overlord
```
You should see a log message printed out for each service that starts up. You can view detailed logs
for any service by looking in the `var/log/druid` directory using another terminal.
## Start Historicals and MiddleManagers
Copy the Druid distribution and your edited configurations to your servers set aside for the Druid Historicals and MiddleManagers.
On each one, *cd* into the distribution and run this command to start a Data server:
```bash
java `cat conf/druid/historical/jvm.config | xargs` -cp conf/druid/_common:conf/druid/historical:lib/* io.druid.cli.Main server historical
java `cat conf/druid/middleManager/jvm.config | xargs` -cp conf/druid/_common:conf/druid/middleManager:lib/* io.druid.cli.Main server middleManager
```
You can add more servers with Druid Historicals and MiddleManagers as needed.
```note-info
For clusters with complex resource allocation needs, you can break apart Historicals and MiddleManagers and scale the components individually.
This also allows you to take advantage of Druid's built-in MiddleManager
autoscaling facility.
```
If you are doing push-based stream ingestion with Kafka or over HTTP, you can also start Tranquility server on the same
hardware that holds MiddleManagers and Historicals. For large scale production, MiddleManagers and Tranquility server
can still be co-located. If you are running Tranquility (not server) with a stream processor, you can co-locate
Tranquility with the stream processor and not require Tranquility server.
```bash
curl -O http://static.druid.io/tranquility/releases/tranquility-distribution-0.7.2.tgz
tar -xzf tranquility-distribution-0.7.2.tgz
cd tranquility-distribution-0.7.2
bin/tranquility <server or kafka> -configFile <path_to_druid_distro>/conf/tranquility/<server or kafka>.json
```
## Start Druid Broker
Copy the Druid distribution and your edited configurations to your servers set aside for the Druid Brokers.
On each one, *cd* into the distribution and run this command to start a Broker (you may want to pipe the output to a log file):
```bash
java `cat conf/druid/broker/jvm.config | xargs` -cp conf/druid/_common:conf/druid/broker:lib/* io.druid.cli.Main server broker
```
You can add more Brokers as needed based on query load.
## Loading data
Congratulations, you now have a Druid cluster! The next step is to learn about recommended ways to load data into
Druid based on your use case. Read more about [loading data](ingestion.html).
---
layout: doc_page
---
Examples
========
The examples on this page are set up in order to give you a feel for what Druid does in practice. They are quick demos of Druid based on [CliRealtimeExample](https://github.com/druid-io/druid/blob/master/services/src/main/java/io/druid/cli/CliRealtimeExample.java). While you wouldn’t run it this way in production you should be able to see how ingestion works and the kind of exploratory queries that are possible. Everything that can be done on your box here can be scaled out to tens of billions of events and terabytes of data per day in a production cluster while still giving the snappy responsive exploratory queries.
Installing Standalone Druid
---------------------------
There are two options for installing standalone Druid. Building from source, and downloading the Druid Standalone Kit (DSK).
### Building from source
Clone Druid and build it:
``` bash
git clone https://github.com/druid-io/druid.git druid
cd druid
git fetch --tags
git checkout druid-<version>
mvn clean package
```
### Downloading the DSK (Druid Standalone Kit)
[Download](http://druid.io/downloads.html) a stand-alone tarball and run it:
``` bash
tar -xzf druid-<version>-bin.tar.gz
cd druid-<version>
```
Twitter Example
---------------
For a full tutorial based on the twitter example, check out this [Twitter Tutorial](twitter-tutorial.html).
This Example uses a feature of Twitter that allows for sampling of its stream. We sample the Twitter stream via our [TwitterSpritzerFirehoseFactory](https://github.com/druid-io/druid/blob/master/examples/src/main/java/druid/examples/twitter/TwitterSpritzerFirehoseFactory.java) class and use it to simulate the kinds of data you might ingest into Druid. Then, with the client part, the sample shows what kinds of analytics explorations you can do during and after the data is loaded.
### What you’ll learn
* See how large amounts of data get ingested into Druid in real-time
* Learn how to do fast, interactive, analytics queries on that real-time data
### What you need
* A build of standalone Druid with the Twitter example (see above)
* A Twitter username and password.
### What you’ll do
See [Twitter Tutorial](twitter-tutorial.html)
Rand Example
------------
This uses `RandomFirehoseFactory` which emits a stream of random numbers (outColumn, a positive double) with timestamps along with an associated token (target). This provides a timeseries that requires no network access for demonstration, characterization, and testing. The generated tuples can be thought of as asynchronously produced triples (timestamp, outColumn, target) where the timestamp varies depending on speed of processing.
In a terminal window, (NOTE: If you are using the cloned Github repository these scripts are in ./examples/bin) start the server with:
``` bash
./run_example_server.sh # type rand when prompted
```
In another terminal window:
``` bash
./run_example_client.sh # type rand when prompted
```
The result of the client query is in JSON format. The client makes a REST request using the program `curl` which is usually installed on Linux, Unix, and OSX by default.
---
layout: doc_page
---
What to Do When You Have a Firewall
-----------------------------------
When you are behind a firewall, if the IRC wikipedia channels that feed realtime data into Druid are not accessible, then there is nothing you can do. If IRC channels are accessible, but downloading Geolite DB from maxmind is firewalled, you can workaround this challenge by making GeoLite DB dependency available offline, see below.
## Making the Wikipedia Example GeoLite DB Dependency Available Offline
1. Download GeoLite2 City DB from http://dev.maxmind.com/geoip/geoip2/geolite2/
2. Copy and extract the DB to *`java.io.tmpdir`*`/io.druid.segment.realtime.firehose.WikipediaIrcDecoder.GeoLite2-City.mmdb`; e.g. `/tmp/io.druid.segment.realtime.firehose.WikipediaIrcDecoder.GeoLite2-City.mmdb`
**Note**: depending on the machine's reboot policy, if the `java.io.tmpdir` resolves to the `/tmp` directory, you may have to create this file again in the `tmp` directory after a machine reboot
## Loading the Data into Druid directly from Kafka
As an alternative to reading the data from the IRC channels, which is challenging to do from behind a firewall, we will use Kafka to stream the data to Druid. To do so, we will need to:
1. Configure the Wikipedia example to read streaming data from Kafka
2. Set up and configure Kafka
#### Wikipedia Example Configuration
1. In your favorite editor, open the file `druid-<version>/examples/wikipedia/wikipedia_realtime.spec`
2. Backup the file, if necessary, then replace the file content with the following:
```json
[
{
"dataSchema": {
"dataSource": "wikipedia",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "timestamp",
"format": "auto"
},
"dimensionsSpec": {
"dimensions": [
"page",
"language",
"user",
"unpatrolled",
"newPage",
"robot",
"anonymous",
"namespace",
"continent",
"country",
"region",
"city"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{
"type": "count",
"name": "count"
},
{
"type": "doubleSum",
"name": "added",
"fieldName": "added"
},
{
"type": "doubleSum",
"name": "deleted",
"fieldName": "deleted"
},
{
"type": "doubleSum",
"name": "delta",
"fieldName": "delta"
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "DAY",
"queryGranularity": "NONE"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connection.timeout.ms": "15000",
"zookeeper.session.timeout.ms": "15000",
"zookeeper.sync.time.ms": "5000",
"group.id": "druid-example",
"fetch.message.max.bytes": "1048586",
"auto.offset.reset": "largest",
"auto.commit.enable": "false"
},
"feed": "wikipedia"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 500000,
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT10m",
"basePersistDirectory": "/tmp/realtime/basePersist",
"rejectionPolicy": {
"type": "messageTime"
}
}
}
]
```
3. Refer to the [Running Example Scripts](#running-example-scripts) section to start the example Druid Realtime node by issuing the following from within your Druid directory:
```bash
./run_example_server.sh
```
#### Kafka Setup and Configuration
1. Download Kafka
For this tutorial we will [download Kafka 0.8.2.1](https://www.apache.org/dyn/closer.cgi?path=/kafka/0.8.2.1/kafka_2.10-0.8.2.1.tgz) and extract it:
```bash
tar -xzf kafka_2.10-0.8.2.1.tgz
cd kafka_2.10-0.8.2.1
```
2. Start Kafka
**First, launch ZooKeeper** (refer to the [Set up Zookeeper](#set-up-zookeeper) section for details), then start the Kafka server (in a separate console):
```bash
./bin/kafka-server-start.sh config/server.properties
```
3. Create a topic named `wikipedia`
```bash
./bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic wikipedia
```
4. Launch a console producer for the topic `wikipedia`
```bash
./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic wikipedia
```
5. Copy and paste the following data into the terminal where we launched the Kafka console producer in the previous step:
```json
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "stringer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
```
#### Finally
Now that data has been fed into Druid, refer to the [Running Example Scripts](#running-example-scripts) section to query the real-time node by issuing the following from within the Druid directory:
```bash
./run_example_client.sh
```
The [Querying Druid](../querying/querying.md) section also has further querying examples.
---
layout: doc_page
---
# Druid Tutorials
We have a series of tutorials to help new users learn to use and operate Druid. We will be adding new tutorials to this list periodically and we encourage the community to contribute tutorials of their own.
## Tutorials
* **[A First Look at Druid](../tutorials/tutorial-a-first-look-at-druid.html)**
This tutorial covers a very basic introduction to Druid. You will load some streaming wikipedia data and learn about basic queries.
* **[The Druid Cluster](../tutorials/tutorial-the-druid-cluster.html)**
This tutorial goes over the basic operations of the nodes in a Druid cluster and how to start the nodes.
* **[Loading Streaming Data](../tutorials/tutorial-loading-streaming-data.html)**
This tutorial covers loading streaming data into Druid.
* **[Loading Batch Data](../tutorials/tutorial-loading-batch-data.html)**
This tutorial covers loading static (batch) data into Druid.
---
layout: doc_page
---
# Loading Data
## Choosing an ingestion method
Druid supports streaming (real-time) and file-based (batch) ingestion methods. The most
popular configurations are:
- [Files](batch-ingestion.html) - Load data from HDFS, S3, local files, or any supported Hadoop
filesystem in batches. We recommend this method if your dataset is already in flat files.
- [Stream push](stream-ingestion.html#stream-push) - Push a data stream into Druid in real-time
using [Tranquility](http://github.com/druid-io/tranquility), a client library for sending streams
to Druid. We recommend this method if your dataset originates in a streaming system like Kafka,
Storm, Spark Streaming, or your own system.
- [Stream pull](stream-ingestion.html#stream-pull) - Pull a data stream directly from an external
data source into Druid using Realtime Nodes.
## Getting started
The easiest ways to get started with loading your own data are the three included tutorials.
- [Files-based tutorial](tutorial-batch.html) showing you how to load files from your local disk.
- [Streams-based tutorial](tutorial-streams.html) showing you how to push data over HTTP.
- [Kafka-based tutorial](tutorial-kafka.html) showing you how to load data from Kafka.
## Hybrid batch/streaming
You can combine batch and streaming methods in a hybrid batch/streaming architecture. In a hybrid architecture,
you use a streaming method to do initial ingestion, and then periodically re-ingest older data in batch mode
(typically every few hours, or nightly). When Druid re-ingests data for a time range, the new data automatically
replaces the data from the earlier ingestion.
All streaming ingestion methods currently supported by Druid do introduce the possibility of dropped or duplicated
messages in certain failure scenarios, and batch re-ingestion eliminates this potential source of error for
historical data.
Batch re-ingestion also gives you the option to re-ingest your data if you needed to revise it for any reason.
---
layout: doc_page
---
# Druid Quickstart
In this quickstart, we will download Druid, set it up on a single machine, load some data, and query the data.
## Prerequisites
You will need:
* Java 7 or higher
* Linux, Mac OS X, or other Unix-like OS (Windows is not supported)
* 8G of RAM
* 2 vCPUs
On Mac OS X, you can use [Oracle's JDK
8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) to install
Java.
On Linux, your OS package manager should be able to help with Java. If your Ubuntu-
based OS does not have a recent enough version of Java, WebUpd8 offers [packages for those
OSes](http://www.webupd8.org/2012/09/install-oracle-java-8-in-ubuntu-via-ppa.html).
## Getting started
To install Druid, issue the following commands in your terminal:
```bash
curl -O http://static.druid.io/artifacts/releases/druid-0.9.0-bin.tar.gz
tar -xzf druid-0.9.0-bin.tar.gz
cd druid-0.9.0
```
In the package, you should find:
* `LICENSE` - the license files.
* `bin/` - scripts useful for this quickstart.
* `conf/*` - template configurations for a clustered setup.
* `conf-quickstart/*` - configurations for this quickstart.
* `extensions/*` - all Druid extensions.
* `hadoop-dependencies/*` - Druid Hadoop dependencies.
* `lib/*` - all included software packages for core Druid.
* `quickstart/*` - files useful for this quickstart.
## Start up Zookeeper
Druid currently has a dependency on [Apache ZooKeeper](http://zookeeper.apache.org/) for distributed coordination. You'll
need to download and run Zookeeper.
```bash
curl http://www.gtlib.gatech.edu/pub/apache/zookeeper/zookeeper-3.4.6/zookeeper-3.4.6.tar.gz -o zookeeper-3.4.6.tar.gz
tar -xzf zookeeper-3.4.6.tar.gz
cd zookeeper-3.4.6
cp conf/zoo_sample.cfg conf/zoo.cfg
./bin/zkServer.sh start
```
## Start up Druid services
With Zookeeper running, return to the druid-0.9.0 directory. In that directory, issue the command:
```bash
bin/init
```
Next, you can start up the Druid processes in different terminal windows. This tutorial runs every Druid process on the same system. In production,
many of these Druid processes can be colocated even in a distributed cluster.
```bash
java `cat conf-quickstart/druid/historical/jvm.config | xargs` -cp conf-quickstart/druid/_common:conf-quickstart/druid/historical:lib/* io.druid.cli.Main server historical
java `cat conf-quickstart/druid/broker/jvm.config | xargs` -cp conf-quickstart/druid/_common:conf-quickstart/druid/broker:lib/* io.druid.cli.Main server broker
java `cat conf-quickstart/druid/coordinator/jvm.config | xargs` -cp conf-quickstart/druid/_common:conf-quickstart/druid/coordinator:lib/* io.druid.cli.Main server coordinator
java `cat conf-quickstart/druid/overlord/jvm.config | xargs` -cp conf-quickstart/druid/_common:conf-quickstart/druid/overlord:lib/* io.druid.cli.Main server overlord
java `cat conf-quickstart/druid/middleManager/jvm.config | xargs` -cp conf-quickstart/druid/_common:conf-quickstart/druid/middleManager:lib/* io.druid.cli.Main server middleManager
```
You should see a log message printed out for each service that starts up.
Later on, if you'd like to stop the services, CTRL-C to exit from the running java processes. If you
want a clean start after stopping the services, delete the `var` directory and run the `init` script again.
Once every service has started, you are now ready to load data.
## Load batch data
We've included a sample of Wikipedia edits from September 12, 2015 to get you started.
```note-info
This section shows you how to load data in batches, but you can skip ahead to learn how to [load
streams in real-time](quickstart.html#load-streaming-data). Druid's streaming ingestion can load data
with virtually no delay between events occurring and being available for queries.
```
The [dimensions](https://en.wikipedia.org/wiki/Dimension_%28data_warehouse%29) (attributes you can
filter and split on) in the Wikipedia dataset, other than time, are:
* channel
* cityName
* comment
* countryIsoCode
* countryName
* isAnonymous
* isMinor
* isNew
* isRobot
* isUnpatrolled
* metroCode
* namespace
* page
* regionIsoCode
* regionName
* user
The [measures](https://en.wikipedia.org/wiki/Measure_%28data_warehouse%29), or *metrics* as they are known in Druid (values you can aggregate)
in the Wikipedia dataset are:
* count
* added
* deleted
* delta
* user_unique
To load this data into Druid, you can submit an *ingestion task* pointing to the file. We've included
a task that loads the `wikiticker-2015-09-12-sampled.json` file included in the archive. To submit
this task, POST it to Druid in a new terminal window from the druid-0.9.0 directory:
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/wikiticker-index.json localhost:8090/druid/indexer/v1/task
```
Which will print the ID of the task if the submission was successful:
```bash
{"task":"index_hadoop_wikipedia_2013-10-09T21:30:32.802Z"}
```
To view the status of your ingestion task, go to your overlord console:
[http://localhost:8090/console.html](http://localhost:8090/console.html). You can refresh the console periodically, and after
the task is successful, you should see a "SUCCESS" status for the task.
After your ingestion task finishes, the data will be loaded by historical nodes and available for
querying within a minute or two. You can monitor the progress of loading your data in the
coordinator console, by checking whether there is a datasource "wikiticker" with a blue circle
indicating "fully available": [http://localhost:8081/#/](http://localhost:8081/#/).
Once the data is fully available, you can immediately query it&mdash; to see how, skip to the [Query
data](#query-data) section below. Or, continue to the [Load your own data](#load-your-own-data)
section if you'd like to load a different dataset.
## Load streaming data
To load streaming data, we are going to push events into Druid
over a simple HTTP API. We will do this using [Tranquility](http://github.com/druid-io/tranquility), a high-level data producer
library for Druid.
To download Tranquility, issue the following commands in your terminal:
```bash
curl -O http://static.druid.io/tranquility/releases/tranquility-distribution-0.7.2.tgz
tar -xzf tranquility-distribution-0.7.2.tgz
cd tranquility-distribution-0.7.2
```
We've included a configuration file in `conf-quickstart/tranquility/server.json` as part of the Druid distribution
for a *metrics* datasource. We're going to start the Tranquility server process, which can be used to push events
directly to Druid.
``` bash
bin/tranquility server -configFile <path_to_druid_distro>/conf-quickstart/tranquility/server.json
```
```note-info
This section shows you how to load data using Tranquility Server, but Druid also supports a wide
variety of [other streaming ingestion options](ingestion-streams.html#stream-push), including from
popular streaming systems like Kafka, Storm, Samza, and Spark Streaming.
```
The [dimensions](https://en.wikipedia.org/wiki/Dimension_%28data_warehouse%29) (attributes you can
filter and split on) for this datasource are flexible. It's configured for *schemaless dimensions*,
meaning it will accept any field in your JSON input as a dimension.
The metrics (also called
[measures](https://en.wikipedia.org/wiki/Measure_%28data_warehouse%29); values
you can aggregate) in this datasource are:
* count
* value_sum (derived from `value` in the input)
* value_min (derived from `value` in the input)
* value_max (derived from `value` in the input)
We've included a script that can generate some random sample metrics to load into this datasource.
To use it, simply run in your Druid distribution repository:
```bash
bin/generate-example-metrics | curl -XPOST -H'Content-Type: application/json' --data-binary @- http://localhost:8200/v1/post/metrics
```
Which will print something like:
```
{"result":{"received":25,"sent":25}}
```
This indicates that the HTTP server received 25 events from you, and sent 25 to Druid. Note that
this may take a few seconds to finish the first time you run it, as Druid resources must be
allocated to the ingestion task. Subsequent POSTs should complete quickly.
Once the data is sent to Druid, you can immediately [query it](#query-data).
## Query data
### Direct Druid queries
Druid supports a rich [family of JSON-based
queries](../querying/querying.html). We've included an example topN query
in `quickstart/wikiticker-top-pages.json` that will find the most-edited articles in this dataset:
```bash
curl -L -H'Content-Type: application/json' -XPOST --data-binary @quickstart/wikiticker-top-pages.json http://localhost:8082/druid/v2/?pretty
```
## Visualizing data
Druid is ideal for powering user-facing analytic applications. There are a number of different open source applications to
visualize and explore data in Druid. We recommend trying [Pivot](https://github.com/implydata/pivot),
[Panoramix](https://github.com/mistercrunch/panoramix), or [Metabase](https://github.com/metabase/metabase) to start
visualizing the data you just ingested.
If you installed Pivot, for example, you should be able to view your data in your browser at [localhost:9090](http://localhost:9090).
### SQL and other query libraries
There are many more query tools for Druid than we've included here, including SQL
engines, and libraries for various languages like Python and Ruby. Please see [the list of
libraries](../development/libraries.html) for more information.
## Clustered setup
This quickstart sets you up with all services running on a single machine. The next step is to [load
your own data](ingestion.html). Or, you can skip ahead to [running a distributed cluster](cluster.html).
---
layout: doc_page
---
# Tutorial: A First Look at Druid
Greetings! This tutorial will help clarify some core Druid concepts. We will use a real-time dataset and issue some basic Druid queries. If you are ready to explore Druid, and learn a thing or two, read on!
Note: If you are behind a corporate firewall, please see our additional [instructions](../tutorials/firewall.html) for running this tutorial.
About the data
--------------
The data source we'll be working with is Wikipedia edits. Each time an edit is made in Wikipedia, an event gets pushed to an IRC channel associated with the language of the Wikipedia page. We scrape IRC channels for several different languages and load this data into Druid.
Each event has a timestamp indicating the time of the edit (in UTC time), a list of dimensions indicating various metadata about the event (such as information about the user editing the page and whether the user is a bot), and a list of metrics associated with the event (such as the number of characters added and deleted).
Specifically, the data schema looks like this:
Dimensions (things to filter on):
```json
"page"
"language"
"user"
"unpatrolled"
"newPage"
"robot"
"anonymous"
"namespace"
"continent"
"country"
"region"
"city"
```
Metrics (things to aggregate over):
```json
"count"
"added"
"delta"
"deleted"
```
Setting Up
----------
To start, we need to get our hands on a Druid build. There are two ways to get Druid: download a tarball, or [Build From Source](../development/build.html). You only need to do one of these.
### Download a Tarball
We've built a tarball that contains everything you'll need. You'll find it [here](http://druid.io/downloads.html). Download this file to a directory of your choosing.
### Build From Source
Follow the [Build From Source](../development/build.html) guide to build from source. Then grab the tarball from distribution/target/druid-<version>-bin.tar.gz.
### Unpack the Tarball
You can extract the content within by issuing:
```
tar -zxvf druid-<version>-bin.tar.gz
```
If you cd into the directory:
```
cd druid-<version>
```
You should see a bunch of files:
* run_example_server.sh
* run_example_client.sh
* LICENSE, config, examples, lib directories
* extensions (This folder contains all the extensions that could be loaded by Druid. Note that the extension `mysql-metadata-storage` is packaged in a separate tarball that can be downloaded from [here](http://druid.io/downloads.html). See [Including Extensions](../operations/including-extensions.html) for more information about loading extensions.)
* hadoop_dependencies (This folder contains hadoop-client:2.3.0, see [Different Hadoop Versions](../operations/other-hadoop.html) for more information about how Druid picks up Hadoop dependencies)
## External Dependencies
Druid requires 3 external dependencies.
* A "deep storage" that acts as a data repository. This is generally distributed storage like HDFS or S3. For prototyping or experimentation on a single machine, Druid can use the local filesystem.
* A "metadata storage" to hold configuration and metadata information. This is generally a small, shared database like MySQL or PostgreSQL. For prototyping or experimentation on a single machine, Druid can use a local instance of [Apache Derby](http://db.apache.org/derby/).
* [Apache Zookeeper](http://zookeeper.apache.org/) for coordination among different pieces of the cluster.
This tutorial only requires Zookeeper be set up.
#### Set up Zookeeper
* Download zookeeper from [http://www.apache.org/dyn/closer.cgi/zookeeper/](http://www.apache.org/dyn/closer.cgi/zookeeper/).
* Install zookeeper.
```bash
ZOOKEPER_VERSION=zookeeper-3.4.6
curl http://www.gtlib.gatech.edu/pub/apache/zookeeper/$ZOOKEPER_VERSION/$ZOOKEPER_VERSION.tar.gz -o $ZOOKEPER_VERSION.tar.gz
tar xzf $ZOOKEPER_VERSION.tar.gz
cd $ZOOKEPER_VERSION
cp conf/zoo_sample.cfg conf/zoo.cfg
./bin/zkServer.sh start
cd ..
```
Note that you may need to update the ZooKeeper version (see [here](http://www.gtlib.gatech.edu/pub/apache/zookeeper/) for the available versions).
Running Example Scripts
-----------------------
Let's start by spinning up a server. You can start an example Druid [Realtime](../design/realtime.html) node by issuing:
```
./run_example_server.sh
```
Select the "wikipedia" example.
Note that the first time you start the example, it may take some extra time due to its fetching various dependencies. Once the node starts up you will see a bunch of logs about setting up properties and connecting to the data source. If everything was successful, you should see messages of the form shown below.
```
2015-02-17T21:46:36,804 INFO [main] org.eclipse.jetty.server.ServerConnector - Started ServerConnector@79b6cf95{HTTP/1.1}{0.0.0.0:8084}
2015-02-17T21:46:36,804 INFO [main] org.eclipse.jetty.server.Server - Started @9580ms
2015-02-17T21:46:36,862 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - irc connection to server [irc.wikimedia.org] established
2015-02-17T21:46:36,862 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #en.wikipedia
2015-02-17T21:46:36,863 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #fr.wikipedia
2015-02-17T21:46:36,863 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #de.wikipedia
2015-02-17T21:46:36,863 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #ja.wikipedia
2015-02-17T21:46:37,009 INFO [ServerInventoryView-0] io.druid.client.BatchServerInventoryView - Inventory Initialized
```
The Druid real-time node ingests events in an in-memory buffer. Periodically, these events will be persisted to disk. If you are interested in the details of our real-time architecture and why we persist indexes to disk, we suggest you read our [White Paper](http://static.druid.io/docs/druid.pdf).
To query the real-time node you've spun up, you can issue:
```
./run_example_client.sh
```
Select "wikipedia" once again. This script issues a [TimeBoundary](../querying/timeboundaryquery.html) query against the data we've been ingesting. The query looks like this:
```json
{
"queryType":"timeBoundary",
"dataSource":"wikipedia"
}
```
The **timeBoundary** query is one of the simplest queries you can make in Druid. It gives you the boundaries of the ingested data.
The result looks something like this (when it's prettified):
```json
[ {
"timestamp" : "2013-09-04T21:44:00.000Z",
"result" : {
"minTime" : "2013-09-04T21:44:00.000Z",
"maxTime" : "2013-09-04T21:47:00.000Z"
}
} ]
```
If you are having problems with getting results back, make sure you have [curl](http://curl.haxx.se/) installed. Control+C to break out of the client script.
Querying Druid
--------------
In your favorite editor, create the file:
```
timeseries.json
```
We are going to make a slightly more complicated query, the [TimeseriesQuery](../querying/timeseriesquery.html). Copy and paste the following into the file:
```json
{
"queryType": "timeseries",
"dataSource": "wikipedia",
"intervals": [ "2010-01-01/2020-01-01" ],
"granularity": "all",
"aggregations": [
{"type": "longSum", "fieldName": "count", "name": "edit_count"},
{"type": "doubleSum", "fieldName": "added", "name": "chars_added"}
]
}
```
Our query has now expanded to include a time interval, [Granularities](../querying/granularities.html), and [Aggregations](../querying/aggregations.html). What the query is doing is aggregating a set of metrics over a span of time, and the results are grouped into a single time bucket.
To issue the query and get some results, run the following in your command line:
```
curl -X POST 'http://localhost:8084/druid/v2/?pretty' -H 'content-type: application/json' -d @timeseries.json
```
Once again, you should get a JSON blob of text back with your results, that looks something like this:
```json
[ {
"timestamp" : "2013-09-04T21:44:00.000Z",
"result" : { "chars_added" : 312670.0, "edit_count" : 733 }
} ]
```
If you issue the query again, you should notice your results updating.
Right now all the results you are getting back are being aggregated into a single timestamp bucket. What if we wanted to see our aggregations on a per minute basis?
We can change granularity for the results to "minute". To specify different granularities to bucket our results, we change our query like so:
```json
{
"queryType": "timeseries",
"dataSource": "wikipedia",
"intervals": [ "2010-01-01/2020-01-01" ],
"granularity": "minute",
"aggregations": [
{"type": "longSum", "fieldName": "count", "name": "edit_count"},
{"type": "doubleSum", "fieldName": "added", "name": "chars_added"}
]
}
```
This gives us results like the following:
```json
[
{
"timestamp" : "2013-09-04T21:44:00.000Z",
"result" : { "chars_added" : 30665.0, "edit_count" : 128 }
},
{
"timestamp" : "2013-09-04T21:45:00.000Z",
"result" : { "chars_added" : 122637.0, "edit_count" : 167 }
},
{
"timestamp" : "2013-09-04T21:46:00.000Z",
"result" : { "chars_added" : 78938.0, "edit_count" : 159 }
},
...
]
```
Solving a Problem
-----------------
One of Druid's main powers is to provide answers to problems, so let's pose a problem. What if we wanted to know what the top pages in the US are, ordered by the number of edits over the last few minutes you've been going through this tutorial? To solve this problem, we can use the [TopN](../querying/topnquery.html).
Let's create the file:
```
topn.json
```
and put the following in there:
```json
{
"queryType": "topN",
"dataSource": "wikipedia",
"granularity": "all",
"dimension": "page",
"metric": "edit_count",
"threshold" : 10,
"aggregations": [
{"type": "longSum", "fieldName": "count", "name": "edit_count"}
],
"filter": { "type": "selector", "dimension": "country", "value": "United States" },
"intervals": ["2012-10-01T00:00/2020-01-01T00"]
}
```
Note that our query now includes [Filters](../querying/filters.html). Filters are like `WHERE` clauses in SQL and help narrow down the data that needs to be scanned.
If you issue the query:
```
curl -X POST 'http://localhost:8084/druid/v2/?pretty' -H 'content-type: application/json' -d @topn.json
```
You should see an answer to our question. As an example, some results are shown below:
```json
[
{
"timestamp" : "2013-09-04T21:00:00.000Z",
"result" : [
{ "page" : "RTC_Transit", "edit_count" : 6 },
{ "page" : "List_of_Deadly_Women_episodes", "edit_count" : 4 },
{ "page" : "User_talk:David_Biddulph", "edit_count" : 4 },
...
]
}
]
```
Feel free to tweak other query parameters to answer other questions you may have about the data. Druid also includes more complex query types such as [groupBy queries](../querying/groupbyquery.html). For more information on querying, see this [link](../querying/querying.html).
Next Steps
----------
This tutorial only covered the basic operations of a single Druid node. For production, you'll likely need a full Druid cluster. Check out our next tutorial [The Druid Cluster](../tutorials/tutorial-the-druid-cluster.html) to learn more.
To learn more about loading streaming data, see [Loading Streaming Data](../tutorials/tutorial-loading-streaming-data.html).
To learn more about loading batch data, see [Loading Batch Data](../tutorials/tutorial-loading-batch-data.html).
Additional Information
----------------------
This tutorial is merely showcasing a small fraction of what Druid can do. If you are interested in more information about Druid, including setting up a more sophisticated Druid cluster, read more of the Druid documentation and blogs found on druid.io.
Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](https://groups.google.com/forum/#!forum/druid-user).
---
layout: doc_page
---
# Tutorial: All About Queries
Hello! This tutorial is meant to provide a more in-depth look into Druid queries. The tutorial is somewhat incomplete right now but we hope to add more content to it in the near future.
Setup
-----
Before we start digging into how to query Druid, make sure you've gone through the other tutorials and are comfortable with spinning up a local cluster and loading data into Druid.
#### Booting a Druid Cluster
Let's start up a simple Druid cluster so we can query all the things.
Note: If Zookeeper and metadata storage aren't running, you'll have to start them again as described in [The Druid Cluster](../tutorials/tutorial-the-druid-cluster.html).
To start a Coordinator node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath config/_common:config/coordinator:lib/* io.druid.cli.Main server coordinator
```
To start a Historical node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath config/_common:config/historical:lib/* io.druid.cli.Main server historical
```
To start a Broker node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath config/_common:config/broker:lib/* io.druid.cli.Main server broker
```
Querying Your Data
------------------
Make sure you've completed [Loading Your Data](../tutorials/tutorial-loading-streaming-data.html) so we have some data to query. Having done that, it's time to query our data! For a complete specification of queries, see [Querying](../querying/querying.html).
#### Construct a Query
```json
{
"queryType": "groupBy",
"dataSource": "wikipedia",
"granularity": "all",
"dimensions": [],
"aggregations": [
{"type": "count", "name": "rows"},
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
#### Query That Data
Run the query against your broker:
```bash
curl -X POST "http://localhost:8082/druid/v2/?pretty" -H 'Content-type: application/json' -d @query.body
```
And get:
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 1545.0,
"edit_count" : 5,
"rows" : 5
}
} ]
```
This result tells us that our query has 5 edits, and we have 5 rows of data as well. In those 5 edits, we have 1545 characters added.
#### What can I query for?
How are we to know what queries we can run? Although [Querying](../querying/querying.html) is a helpful index, to get a handle on querying our data we need to look at our ingestion schema. There are a few particular fields we care about in the ingestion schema. All of these fields should be present in both the real-time ingestion schema and the batch ingestion schema.
Datasource:
```json
"dataSource":"wikipedia"
```
Our dataSource tells us the name of the relation/table, or 'source of data'. What we decide to name our data source must match the data source we are going to be querying.
Granularity:
```json
"indexGranularity": "none",
```
Druid will roll up data at ingestion time unless the index/rollup granularity is specified as "none". Your query granularity cannot be lower than your index granularity.
Aggregators:
```json
"aggregators" : [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}]
```
The [Aggregations](../querying/aggregations.html) specified at ingestion time correlate directly to the metrics that can be queried.
Dimensions:
```json
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
```
These specify the dimensions that we can filter our data on. If we add a dimension to our groupBy query, we get:
```json
{
"queryType": "groupBy",
"dataSource": "wikipedia",
"granularity": "all",
"dimensions": ["namespace"],
"aggregations": [
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Which gets us data grouped over the namespace dimension in return!
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 180.0,
"edit_count" : 2,
"namespace" : "article"
}
}, {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 1365.0,
"edit_count" : 3,
"namespace" : "wikipedia"
}
} ]
```
Additionally, we can also filter our query to narrow down our metric values:
```json
{
"queryType": "groupBy",
"dataSource": "wikipedia",
"granularity": "all",
"filter": { "type": "selector", "dimension": "namespace", "value": "article" },
"aggregations": [
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Which gets us metrics about only those edits where the namespace is 'article':
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 180.0,
"edit_count" : 2
}
} ]
```
Check out [Filters](../querying/filters.html) for more information.
What Types of Queries to Use
----------------------------
The type of query you should use depends on your use case. [TimeBoundary queries](../querying/timeboundaryquery.html) are useful to understand the range of your data. [Timeseries queries](../querying/timeseriesquery.html) are useful for aggregates and filters over a time range, and offer significant speed improvements over [GroupBy queries](../querying/groupbyquery.html). To find the top values for a given dimension, [TopN queries](../querying/topnquery.html) should be used over group by queries as well.
## Learn More ##
You can learn more about querying at [Querying](../querying/querying.html)! If you are ready to evaluate Druid more in depth, check out [Booting a production cluster](../tutorials/booting-a-production-cluster.html)!
---
layout: doc_page
---
## Load your own batch data
Before you get started with loading your own batch data, you should have first completed the [quickstart](quickstart.html).
You can easily load any timestamped dataset into Druid. For Druid batch loads, the most important
questions are:
* What should the dataset be called? This is the "dataSource" field of the "dataSchema".
* Where is the dataset located? The file paths belong in the "paths" of the "inputSpec". If you
want to load multiple files, you can provide them as a comma-separated string.
* Which field should be treated as a timestamp? This belongs in the "column" of the "timestampSpec".
* Which fields should be treated as dimensions? This belongs in the "dimensions" of the "dimensionsSpec".
* Which fields should be treated as metrics? This belongs in the "metricsSpec".
* What time ranges (intervals) are being loaded? This belongs in the "intervals" of the "granularitySpec".
```note-info
If your data does not have a natural sense of time, you can tag each row with the current time.
You can also tag all rows with a fixed timestamp, like "2000-01-01T00:00:00.000Z".
```
Let's use this pageviews dataset as an example. Druid supports TSV, CSV, and JSON out of the box.
Note that nested JSON objects are not supported, so if you do use JSON, you should provide a file
containing flattened objects.
```json
{"time": "2015-09-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
{"time": "2015-09-01T01:00:00Z", "url": "/", "user": "bob", "latencyMs": 11}
{"time": "2015-09-01T01:30:00Z", "url": "/foo/bar", "user": "bob", "latencyMs": 45}
```
Make sure the file has no newline at the end. If you save this to a file called "pageviews.json", then for this dataset:
* Let's call the dataset "pageviews".
* The data is located in "pageviews.json".
* The timestamp is the "time" field.
* Good choices for dimensions are the string fields "url" and "user".
* Good choices for metrics are a count of pageviews, and the sum of "latencyMs". Collecting that
sum when we load the data will allow us to compute an average at query time as well.
* The data covers the time range 2015-09-01 (inclusive) through 2015-09-02 (exclusive).
You can copy the existing `quickstart/wikiticker-index.json` indexing task to a new file:
```bash
cp quickstart/wikiticker-index.json quickstart/pageviews-index.json
```
And modify it by altering these sections:
```json
"dataSource": "pageviews"
```
```json
"inputSpec": {
"type": "static",
"paths": "pageviews.json"
}
```
```json
"timestampSpec": {
"format": "auto",
"column": "time"
}
```
```json
"dimensionsSpec": {
"dimensions": ["url", "user"]
}
```
```json
"metricsSpec": [
{"name": "views", "type": "count"},
{"name": "latencyMs", "type": "doubleSum", "fieldName": "latencyMs"}
]
```
```json
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "day",
"queryGranularity": "none",
"intervals": ["2015-09-01/2015-09-02"]
}
```
Finally, fire off the task and indexing will proceed!
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/pageviews-index.json localhost:8090/druid/indexer/v1/task
```
If anything goes wrong with this task (e.g. it finishes with status FAILED), you can troubleshoot
by visiting the "Task log" on the [overlord console](http://localhost:8090/console.html).
```note-info
Druid supports a wide variety of data formats, ingestion options, and configurations not
discussed here. For a full explanation of all available features, see the ingestion sections of the Druid
documentation.
```
For more information on loading batch data, please see [the batch ingestion documentation](../ingestion/batch-ingestion.html).
---
layout: doc_page
---
# Tutorial: Load from Kafka
## Getting started
This tutorial shows you how to load data from Kafka into Druid.
For this tutorial, we'll assume you've already downloaded Druid and Tranquility as described in
the [single-machine quickstart](quickstart.html) and have it running on your local machine. You
don't need to have loaded any data yet.
```note-info
This tutorial will show you how to load data from Kafka into Druid, but Druid additionally supports
a wide variety of batch and streaming loading methods. See the *[Loading files](../ingestion/batch-ingestion.html)*
and *[Loading streams](../ingestion/stream-ingestion.html)* pages for more information about other options,
including from Hadoop, HTTP, Storm, Samza, Spark Streaming, and your own JVM apps.
```
## Start Kafka
[Apache Kafka](http://kafka.apache.org/) is a high throughput message bus that works well with
Druid. For this tutorial, we will use Kafka 0.9.0.0. To download Kafka, issue the following
commands in your terminal:
```bash
curl -O http://www.us.apache.org/dist/kafka/0.9.0.0/kafka_2.11-0.9.0.0.tgz
tar -xzf kafka_2.11-0.9.0.0.tgz
cd kafka_2.11-0.9.0.0
```
Start a Kafka broker by running the following command in a new terminal:
```bash
./bin/kafka-server-start.sh config/server.properties
```
Run this command to create a Kafka topic called *metrics*, to which we'll send data:
```bash
./bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic metrics
```
## Enable Druid Kafka ingestion
Druid includes configs for [Tranquility Kafka](ingestion-streams.md#kafka) to support loading data from Kafka.
To enable this in the quickstart-based configuration:
- Stop your Tranquility command (CTRL-C) and then start it up again.
## Send example data
Let's launch a console producer for our topic and send some data!
In your Druid directory, generate some metrics by running:
```bash
bin/generate-example-metrics
```
In your Kafka directory, run:
```bash
./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic metrics
```
The *kafka-console-producer* command is now awaiting input. Copy the generated example metrics,
paste them into the *kafka-console-producer* terminal, and press enter. If you like, you can also
paste more messages into the producer, or you can press CTRL-D to exit the console producer.
You can immediately query this data, or you can skip ahead to the
[Loading your own data](#loading-your-own-data) section if you'd like to load your own dataset.
## Querying your data
After sending data, you can immediately query it using any of the
[supported query methods](../querying/querying.html).
## Loading your own data
So far, you've loaded data into Druid from Kafka using an ingestion spec that we've included in the
distribution. Each ingestion spec is designed to work with a particular dataset. You load your own
data types into Druid by writing a custom ingestion spec.
You can write a custom ingestion spec by starting from the bundled configuration in
`conf-quickstart/tranquility/kafka.json` and modifying it for your own needs.
The most important questions are:
* What should the dataset be called? This is the "dataSource" field of the "dataSchema".
* Which field should be treated as a timestamp? This belongs in the "column" of the "timestampSpec".
* Which fields should be treated as dimensions? This belongs in the "dimensions" of the "dimensionsSpec".
* Which fields should be treated as measures? This belongs in the "metricsSpec".
Let's use a small JSON pageviews dataset in the topic *pageviews* as an example, with records like:
```json
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
```
First, create the topic:
```bash
./bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic pageviews
```
Next, edit `conf-quickstart/tranquility/kafka.json`:
* Let's call the dataset "pageviews-kafka".
* The timestamp is the "time" field.
* Good choices for dimensions are the string fields "url" and "user".
* Good choices for measures are a count of pageviews, and the sum of "latencyMs". Collecting that
sum when we load the data will allow us to compute an average at query time as well.
You can edit the existing `conf-quickstart/tranquility/kafka.json` file by altering these
sections:
1. Change the key `"metrics-kafka"` under `"dataSources"` to `"pageviews-kafka"`
2. Alter these sections under the new `"pageviews-kafka"` key:
```json
"dataSource": "pageviews-kafka"
```
```json
"timestampSpec": {
"format": "auto",
"column": "time"
}
```
```json
"dimensionsSpec": {
"dimensions": ["url", "user"]
}
```
```json
"metricsSpec": [
{"name": "views", "type": "count"},
{"name": "latencyMs", "type": "doubleSum", "fieldName": "latencyMs"}
]
```
```json
"properties" : {
"task.partitions" : "1",
"task.replicants" : "1",
"topicPattern" : "pageviews"
}
```
Next, start Druid Kafka ingestion:
```bash
bin/tranquility kafka -configFile ../druid-0.9.0-SNAPSHOT/conf-quickstart/tranquility/kafka.json
```
- If your Tranquility server or Kafka is already running, stop it (CTRL-C) and
start it up again.
Finally, send some data to the Kafka topic. Let's start with these messages:
```json
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
{"time": "2000-01-01T00:00:00Z", "url": "/", "user": "bob", "latencyMs": 11}
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "bob", "latencyMs": 45}
```
Druid streaming ingestion requires relatively current messages (relative to a slack time controlled by the
[windowPeriod](../ingestion/stream-ingestion.html#segmentgranularity-and-windowperiod) value), so you should
replace `2000-01-01T00:00:00Z` in these messages with the current time in ISO8601 format. You can
get this by running:
```bash
python -c 'import datetime; print(datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))'
```
Update the timestamps in the JSON above, then copy and paste these messages into this console
producer and press enter:
```bash
./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic pageviews
```
That's it, your data should now be in Druid. You can immediately query it using any of the
[supported query methods](../querying/querying.html).
## Further reading
To read more about loading streams, see our [streaming ingestion documentation](../ingestion/stream-ingestion.html).
---
layout: doc_page
---
# Tutorial: Loading Batch Data
In this tutorial, we will learn about batch ingestion (as opposed to real-time ingestion) and how to create segments using the final piece of the Druid Cluster, the [indexing service](../design/indexing-service.html). The indexing service is a standalone service that accepts [tasks](../misc/tasks.html) in the form of POST requests. Most tasks produce segments as their output. The indexing service can be used as a single service for both real-time/streaming and batch ingestion.
The Data
--------
The data source we'll be using is Wikipedia edits. The data schema is:
Dimensions (things to filter on):
```json
"page"
"language"
"user"
"unpatrolled"
"newPage"
"robot"
"anonymous"
"namespace"
"continent"
"country"
"region"
"city"
```
Metrics (things to aggregate over):
```json
"count"
"added"
"delta"
"deleted"
```
Batch Ingestion
---------------
For the purposes of this tutorial, we are going to use our very small and simple Wikipedia data set. This data can directly be ingested via other means as shown in the previous [tutorial](../tutorials/tutorial-loading-streaming-data.html).
Our data is located at:
```
examples/indexing/wikipedia_data.json
```
The following events should exist in the file:
```json
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "stringer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
```
#### Set Up a Druid Cluster
To index the data, we are going to need the overlord, a historical node, and a coordinator node.
Note: If Zookeeper isn't running, you'll have to start it again as described in [The Druid Cluster](../tutorials/tutorial-the-druid-cluster.html).
To start the Indexing Service:
```bash
java -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath config/_common:config/overlord:lib/* io.druid.cli.Main server overlord
```
To start the Coordinator Node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath config/_common:config/coordinator:lib/* io.druid.cli.Main server coordinator
```
To start the Historical Node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath config/_common:config/historical:lib/* io.druid.cli.Main server historical
```
#### Index the Data
There are two ways we can load the data, depending on the data volume. The simplest method of loading data is to use the [Index Task](../misc/tasks.html). Index tasks can load batch data without any external dependencies. They are, however, slow when the data volume exceeds 1G.
#### Index Task
To index the data and build a Druid segment, we are going to need to submit a task to the indexing service. This task should already exist:
```
examples/indexing/wikipedia_index_task.json
```
Open up the file to see the following:
```json
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"],
"dimensionExclusions" : [],
"spatialDimensions" : []
}
}
},
"metricsSpec" : [
{
"type" : "count",
"name" : "count"
},
{
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
},
{
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
},
{
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "DAY",
"queryGranularity" : "NONE",
"intervals" : [ "2013-08-31/2013-09-01" ]
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "examples/indexing/",
"filter" : "wikipedia_data.json"
}
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 0,
"rowFlushBoundary" : 0
}
}
}
```
Okay, so what is happening here? The "type" field indicates the type of task we plan to run. In this case, it is a simple "index" task. The "parseSpec" indicates how we plan to figure out what the timestamp and dimension columns are. The "granularitySpec" indicates that we are building a daily segment for 2013-08-31 to 2013-09-01 and the minimum queryGranularity will be millisecond (NONE). Next, the "metricsSpec" indicates which fields in our data set we plan to build metric columns for. The "fieldName" corresponds to the metric name in the raw data. The "name" corresponds to what our metric column is actually going to be called in the segment. Finally, we have a local "firehose" that is going to read data from disk. We tell the firehose where our data is located and the types of files we are looking to ingest. In our case, we only have a single data file.
Let's send our task to the indexing service now:
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8090/druid/indexer/v1/task
```
Issuing the request should return a task ID like so:
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8090/druid/indexer/v1/task
{"task":"index_wikipedia_2013-10-09T21:30:32.802Z"}
```
In your indexing service logs, you should see the following:
```bash
2013-10-09 21:41:41,150 INFO [qtp300448720-21] io.druid.indexing.overlord.HeapMemoryTaskStorage - Inserting task index_wikipedia_2013-10-09T21:41:41.147Z with status: TaskStatus{id=index_wikipedia_2013-10-09T21:41:41.147Z, status=RUNNING, duration=-1}
2013-10-09 21:41:41,151 INFO [qtp300448720-21] io.druid.indexing.overlord.TaskLockbox - Created new TaskLockPosse: TaskLockPosse{taskLock=TaskLock{groupId=index_wikipedia_2013-10-09T21:41:41.147Z, dataSource=wikipedia, interval=2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z, version=2013-10-09T21:41:41.151Z}, taskIds=[]}
...
2013-10-09 21:41:41,215 INFO [pool-6-thread-1] io.druid.indexing.overlord.ForkingTaskRunner - Logging task index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0 output to: /tmp/persistent/index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0/b5099fdb-d6b0-4b81-9053-b2af70336a7e/log
2013-10-09 21:41:45,017 INFO [qtp300448720-22] io.druid.indexing.common.actions.LocalTaskActionClient - Performing action for task[index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0]: LockListAction{}
```
After a few seconds, the task should complete and you should see in the indexing service logs:
```bash
2013-10-09 21:41:45,765 INFO [pool-6-thread-1] io.druid.indexing.overlord.exec.TaskConsumer - Received SUCCESS status for task: IndexGeneratorTask{id=index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0, type=index_generator, dataSource=wikipedia, interval=Optional.of(2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z)}
```
Congratulations! The segment has completed building. Once a segment is built, a segment metadata entry is created in your metadata storage table. The coordinator compares what is in the segment metadata table with what is in the cluster. A new entry in the metadata table will cause the coordinator to load the new segment in a minute or so.
You should see the following logs on the coordinator:
```bash
2013-10-09 21:41:54,368 INFO [Coordinator-Exec--0] io.druid.server.coordinator.helper.DruidCoordinatorLogger - [_default_tier] : Assigned 1 segments among 1 servers
2013-10-09 21:41:54,369 INFO [Coordinator-Exec--0] io.druid.server.coordinator.helper.DruidCoordinatorLogger - Load Queues:
2013-10-09 21:41:54,369 INFO [Coordinator-Exec--0] io.druid.server.coordinator.helper.DruidCoordinatorLogger - Server[localhost:8083, historical, _default_tier] has 1 left to load, 0 left to drop, 4,477 bytes queued, 4,477 bytes served.
```
These logs indicate that the coordinator has assigned our new segment to the historical node to download and serve. If you look at the historical node logs, you should see:
```bash
2013-10-09 21:41:54,369 INFO [ZkCoordinator-0] io.druid.server.coordination.ZkCoordinator - Loading segment wikipedia_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_2013-10-09T21:41:41.151Z
2013-10-09 21:41:54,369 INFO [ZkCoordinator-0] io.druid.segment.loading.LocalDataSegmentPuller - Unzipping local file[/tmp/druid/localStorage/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0/index.zip] to [/tmp/druid/indexCache/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0]
2013-10-09 21:41:54,370 INFO [ZkCoordinator-0] io.druid.utils.CompressionUtils - Unzipping file[/tmp/druid/localStorage/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0/index.zip] to [/tmp/druid/indexCache/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0]
2013-10-09 21:41:54,380 INFO [ZkCoordinator-0] io.druid.server.coordination.SingleDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_2013-10-09T21:41:41.151Z] to path[/druid/servedSegments/localhost:8083/wikipedia_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_2013-10-09T21:41:41.151Z]
```
Once the segment is announced the segment is queryable. Now you should be able to query the data.
Issuing a [TimeBoundaryQuery](../querying/timeboundaryquery.html) should yield:
```json
[ {
"timestamp" : "2013-08-31T01:02:33.000Z",
"result" : {
"minTime" : "2013-08-31T01:02:33.000Z",
"maxTime" : "2013-08-31T12:41:27.000Z"
}
} ]
```
Console
--------
The indexing service overlord has a console located at:
```bash
localhost:8090/console.html
```
On this console, you can look at statuses and logs of recently submitted and completed tasks.
If you decide to reuse the local firehose to ingest your own data and if you run into problems, you can use the console to read the individual task logs.
Task logs can be stored locally or uploaded to [Deep Storage](../dependencies/deep-storage.html). More information about how to configure this is [here](../configuration/index.html).
Most common data ingestion problems are around timestamp formats and other malformed data issues.
#### Hadoop Index Task
Druid is designed for large data volumes, and most real-world data sets require batch indexing be done through a Hadoop job.
For this tutorial, we used [Hadoop 2.3.0](https://archive.apache.org/dist/hadoop/core/hadoop-2.3.0/), which is included under ```hadoop-dependencies```. There are many pages on the Internet showing how to set up a single-node (standalone) Hadoop cluster, which is all that's needed for this example. For more information about how Druid picks up your Hadoop version, see [here](../operations/other-hadoop.html).
Before indexing the data, make sure you have a valid Hadoop cluster running. To build our Druid segment, we are going to submit a [Hadoop index task](../misc/tasks.html) to the indexing service. The grammar for the Hadoop index task is very similar to the index task of the last tutorial. The tutorial Hadoop index task should be located at:
```
examples/indexing/wikipedia_index_hadoop_task.json
```
Examining the contents of the file, you should find:
```json
{
"type" : "index_hadoop",
"spec" : {
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"],
"dimensionExclusions" : [],
"spatialDimensions" : []
}
}
},
"metricsSpec" : [
{
"type" : "count",
"name" : "count"
},
{
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
},
{
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
},
{
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "DAY",
"queryGranularity" : "NONE",
"intervals" : [ "2013-08-31/2013-09-01" ]
}
},
"ioConfig" : {
"type" : "hadoop",
"inputSpec" : {
"type" : "static",
"paths" : "/MyDirectory/examples/indexing/wikipedia_data.json"
}
}
}
}
```
If you are curious about what all this configuration means, see [here](../misc/tasks.html).
To submit the task:
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_hadoop_task.json localhost:8090/druid/indexer/v1/task
```
After the task is completed, the segment should be assigned to your historical node. You should be able to query the segment.
Next Steps
----------
We demonstrated using the indexing service as a way to ingest data into Druid. Previous versions of Druid used the [HadoopDruidIndexer](../ingestion/batch-ingestion.html) to ingest batch data. The `HadoopDruidIndexer` remains a valid option for batch ingestion; however, we recommend using the indexing service as the preferred method of getting batch data into Druid.
Additional Information
----------------------
Getting data into Druid can definitely be difficult for first-time users. Please don't hesitate to ask questions in our IRC channel or on our [Google Groups page](https://groups.google.com/forum/#!forum/druid-user).
---
layout: doc_page
---
# Loading Streaming Data
In our [last tutorial](../tutorials/tutorial-the-druid-cluster.html), we set up a
complete Druid cluster. We created all the Druid dependencies and ingested
streaming data. In this tutorial, we will expand upon what we've done in the
first two tutorials.
## About the Data
We will be working with the same Wikipedia edits data schema [from our previous
tutorials](tutorial-a-first-look-at-druid.html#about-the-data).
## Set Up
At this point, you should already have Druid downloaded and be comfortable
running a Druid cluster locally. If not, [have a look at our second
tutorial](../tutorials/tutorial-the-druid-cluster.html). If Zookeeper is not
running, you will have to start it as described in [The Druid
Cluster](../tutorials/tutorial-the-druid-cluster.html).
With real-world data, we recommend having a message bus such as [Apache
Kafka](http://kafka.apache.org/) sit between the data stream and the real-time
node. The message bus provides higher availability for production environments.
[Firehoses](../ingestion/firehose.html) are the key abstraction for real-time ingestion.
### Kafka
Druid communicates with Kafka using the
[KafkaFirehoseFactory](../ingestion/firehose.html). Using this [Firehose](../ingestion/firehose.html)
with the right configuration, we can import data into Druid in real-time
without writing any code. To load data to a real-time node via Kafka, we'll
first need to initialize Zookeeper and Kafka, and then configure and initialize
a [Realtime](../design/realtime.html) node.
The following quick-start instructions for booting a Zookeeper and then Kafka
cluster were adapted from the [Apache Kafka quickstart guide](http://kafka.apache.org/documentation.html#quickstart).
1. Download Kafka
For this tutorial we will [download Kafka 0.8.2.1](https://www.apache.org/dyn/closer.cgi?path=/kafka/0.8.2.1/kafka_2.10-0.8.2.1.tgz)
```bash
tar -xzf kafka_2.10-0.8.2.1.tgz
cd kafka_2.10-0.8.2.1
```
1. Start Kafka
First launch ZooKeeper:
```bash
./bin/zookeeper-server-start.sh config/zookeeper.properties
```
Then start the Kafka server (in a separate console):
```bash
./bin/kafka-server-start.sh config/server.properties
```
1. Create a topic named `wikipedia`
```bash
./bin/kafka-topics.sh --create --zookeeper localhost:2181 \
--replication-factor 1 --partitions 1 --topic wikipedia
```
1. Launch a console producer for that topic (so we can paste in kafka
messages in a bit)
```bash
./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic wikipedia
```
### Druid Realtime Node
The realtime spec for the data source in this tutorial is available under
`examples/indexing/wikipedia.spec` from the [Druid
download](http://druid.io/downloads.html)
1. Launch the realtime node
```bash
java -Xmx512m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-Ddruid.realtime.specFile=examples/indexing/wikipedia.spec \
-classpath "config/_common:config/realtime:lib/*" \
io.druid.cli.Main server realtime
```
1. Copy and paste the following data into the terminal where we launched
the Kafka console producer above.
```json
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "stringer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
```
**Note:** This config uses a [`messageTime` rejection policy](../design/plumber.html)
which will accept all events and hand off as long as there is a continuous
stream of events. In this particular example, hand-off will not actually
occur because we only have a handful of events.
Disclaimer: We recognize the timestamps of these events aren't actually recent.
1. Watch the events getting ingested and the real-time node announcing a data
segment
```
...
2015-02-17T23:01:50,220 INFO [chief-wikipedia] io.druid.server.coordination.BatchDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_2013-08-31T00:00:00.000Z] at path[/druid/segments/localhost:8084/2015-02-17T23:01:50.219Z0]
...
```
1. Issue a query
Issuing a [TimeBoundaryQuery](../querying/timeboundaryquery.html) to the real-time node
should return some results:
```bash
curl -XPOST -H'Content-type: application/json' \
"http://localhost:8084/druid/v2/?pretty" \
-d'{"queryType":"timeBoundary","dataSource":"wikipedia"}'
```
```json
[ {
"timestamp" : "2013-08-31T01:02:33.000Z",
"result" : {
"minTime" : "2013-08-31T01:02:33.000Z",
"maxTime" : "2013-08-31T12:41:27.000Z"
}
} ]
```
## Advanced Streaming Ingestion
Druid offers an additional method of ingesting streaming data via the indexing service. You may be wondering why a second method is needed. Standalone real-time nodes are sufficient for certain volumes of data and availability tolerances. They pull data from a message queue like Kafka or Rabbit, index data locally, and periodically finalize segments for handoff to historical nodes. They are fairly straightforward to scale, simply taking advantage of the innate scalability of the backing message queue. But they are difficult to make highly available with Kafka, the most popular supported message queue, because its high-level consumer doesn’t provide a way to scale out two replicated consumer groups such that each one gets the same data in the same shard. They also become difficult to manage once you have a lot of them, since every machine needs a unique configuration.
Druid solved the availability problem by switching from a pull-based model to a push-based model; rather than Druid indexers pulling data from Kafka, another process pulls data and pushes the data into Druid. Because the push-based model lets us ensure that the same data makes it into the same shard, we can replicate data. The [indexing service](../design/indexing-service.html) encapsulates this functionality, where a task-and-resources model replaces a standalone machine model. In addition to simplifying machine configuration, the model also allows nodes to run in the cloud with an elastic number of machines. If you are interested in this form of real-time ingestion, please check out the client library [Tranquility](https://github.com/druid-io/tranquility).
Additional Information
----------------------
Getting data into Druid can definitely be difficult for first time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](https://groups.google.com/forum/#!forum/druid-user).
---
layout: doc_page
---
## Load your own streaming data
## Getting started
This tutorial shows you how to load your own streams into Druid.
For this tutorial, we'll assume you've already downloaded Druid and Tranquility as described in
the [single-machine quickstart](quickstart.html) and have it running on your local machine. You
don't need to have loaded any data yet.
Once that's complete, you can load your own dataset by writing a custom ingestion spec.
## Writing an ingestion spec
When loading streams into Druid, we recommend using the [stream push](../ingestion/stream-push.html)
process. In this tutorial we'll be using [Tranquility Server](../ingestion/stream-ingestion.html#server) to push
data into Druid over HTTP.
```note-info
This tutorial will show you how to push streams to Druid using HTTP, but Druid additionally supports
a wide variety of batch and streaming loading methods. See the *[Loading files](../ingestion/batch-ingestion.html)*
and *[Loading streams](../ingestion/stream-ingestion.html)* pages for more information about other options,
including from Hadoop, Kafka, Storm, Samza, Spark Streaming, and your own JVM apps.
```
You can prepare for loading a new dataset over HTTP by writing a custom Tranquility Server
configuration. The bundled configuration is in `conf-quickstart/tranquility/server.json`, which
you can modify for your own needs.
The most important questions are:
* What should the dataset be called? This is the "dataSource" field of the "dataSchema".
* Which field should be treated as a timestamp? This belongs in the "column" field of the "timestampSpec".
* Which fields should be treated as dimensions? This belongs in the "dimensions" field of the "dimensionsSpec".
* Which fields should be treated as measures? This belongs in the "metricsSpec" field.
Let's use a small JSON pageviews dataset as an example, with records like:
```json
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
```
So the answers to the questions above are:
* Let's call the dataset "pageviews".
* The timestamp is the "time" field.
* Good choices for dimensions are the string fields "url" and "user".
* Good choices for measures are a count of pageviews, and the sum of "latencyMs". Collecting that
sum when we load the data will allow us to compute an average at query time as well.
Now, edit the existing `conf-quickstart/tranquility/server.json` file by altering these
sections:
1. Change the key `"metrics"` under `"dataSources"` to `"pageviews"`
2. Alter these sections under the new `"pageviews"` key:
```json
"dataSource": "pageviews"
```
```json
"timestampSpec": {
"format": "auto",
"column": "time"
}
```
```json
"dimensionsSpec": {
"dimensions": ["url", "user"]
}
```
```json
"metricsSpec": [
{"name": "views", "type": "count"},
{"name": "latencyMs", "type": "doubleSum", "fieldName": "latencyMs"}
]
```
## Restarting the server
Restart the server to pick up the new configuration file by stopping Tranquility (CTRL-C) and starting it up again.
## Sending data
Let's send some data! We'll start with these three records:
```json
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
{"time": "2000-01-01T00:00:00Z", "url": "/", "user": "bob", "latencyMs": 11}
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "bob", "latencyMs": 45}
```
Druid streaming ingestion requires relatively current messages (relative to a slack time controlled by the
[windowPeriod](../ingestion/stream-ingestion.html#segmentgranularity-and-windowperiod) value), so you should
replace `2000-01-01T00:00:00Z` in these messages with the current time in ISO8601 format. You can
get this by running:
```bash
python -c 'import datetime; print(datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))'
```
Update the timestamps in the JSON above, and save it to a file named `pageviews.json`. Then send
it to Druid by running:
```bash
curl -XPOST -H'Content-Type: application/json' --data-binary @pageviews.json http://localhost:8200/v1/post/pageviews
```
This will print something like:
```
{"result":{"received":3,"sent":3}}
```
This indicates that the HTTP server received 3 events from you, and sent 3 to Druid. Note that
this may take a few seconds to finish the first time you run it, as Druid resources must be
allocated to the ingestion task. Subsequent POSTs should complete quickly.
If you see `"sent":0` this likely means that your timestamps are not recent enough. Try adjusting
your timestamps and re-sending your data.
## Querying your data
After sending data, you can immediately query it using any of the
[supported query methods](../querying/querying.html).
## Further reading
To read more about loading streams, see our [streaming ingestion documentation](../ingestion/stream-ingestion.html).
---
layout: doc_page
---
# Tutorial: The Druid Cluster
Welcome back! In our first [tutorial](../tutorials/tutorial-a-first-look-at-druid.html), we introduced you to the most basic Druid setup: a single realtime node. We streamed in some data and queried it. Realtime nodes collect very recent data and periodically hand that data off to the rest of the Druid cluster. Some questions about the architecture must naturally come to mind. What does the rest of the Druid cluster look like?
This tutorial will hopefully answer these questions!
In this tutorial, we will set up other types of Druid nodes and external dependencies for a fully functional Druid cluster. The architecture of Druid is very much like the [Megazord](http://www.youtube.com/watch?v=7mQuHh1X4H4) from the popular 90s show Mighty Morphin' Power Rangers. Each Druid node has a specific purpose and the nodes come together to form a fully functional system.
## Downloading Druid
If you followed the first tutorial, you should already have Druid downloaded. If not, let's go back and do that first.
You can download the latest version of druid [here](http://druid.io/downloads.html). You can also [Build From Source](../development/build.html) and grab the tarball from distribution/target/druid-<version>-bin.tar.gz.
Either way, once you have the tarball, untar the contents within by issuing:
```bash
tar -zxvf druid-<version>-bin.tar.gz
cd druid-<version>
```
## External Dependencies
Druid requires 3 external dependencies.
* A "deep storage" that acts as a data repository. This is generally distributed storage like HDFS or S3. For prototyping or experimentation on a single machine, Druid can use the local filesystem.
* A "metadata storage" to hold configuration and metadata information. This is generally a small, shared database like MySQL or PostgreSQL. For prototyping or experimentation on a single machine, Druid can use a local instance of [Apache Derby](http://db.apache.org/derby/).
* [Apache Zookeeper](http://zookeeper.apache.org/) for coordination among different pieces of the cluster.
For deep storage, we will use the local disk in this tutorial, but for production, HDFS and S3 are popular options. For the metadata storage, Derby is used in this tutorial, but for production MySQL or PostgreSQL etc should be used.
#### Set up Zookeeper
* Download zookeeper from [http://www.apache.org/dyn/closer.cgi/zookeeper/](http://www.apache.org/dyn/closer.cgi/zookeeper/)
* Install zookeeper.
```bash
curl http://www.gtlib.gatech.edu/pub/apache/zookeeper/zookeeper-3.4.6/zookeeper-3.4.6.tar.gz -o zookeeper-3.4.6.tar.gz
tar xzf zookeeper-3.4.6.tar.gz
cd zookeeper-3.4.6
cp conf/zoo_sample.cfg conf/zoo.cfg
./bin/zkServer.sh start
cd ..
```
## The Data
Similar to the first tutorial, the data we will be loading is based on edits that have occurred on Wikipedia. Every time someone edits a page in Wikipedia, metadata is generated about the editor and edited page. Druid collects each individual event and packages them together in a container known as a [segment](../design/segments.html). Segments contain data over some span of time. We've prebuilt a segment for this tutorial and will cover making your own segments in other [pages](../tutorials/tutorial-loading-streaming-data.html). The segment we are going to work with has the following format:
Dimensions (things to filter on):
```json
"page"
"language"
"user"
"unpatrolled"
"newPage"
"robot"
"anonymous"
"namespace"
"continent"
"country"
"region"
"city"
```
Metrics (things to aggregate over):
```json
"count"
"added"
"delta"
"deleted"
```
## The Cluster
Before we get started, let's make sure we have configs in the config directory for our various nodes. Issue the following from the Druid home directory:
```
ls config
```
If you are interested in learning more about Druid configuration files, check out this [link](../configuration/index.html). Many aspects of Druid are customizable. For the purposes of this tutorial, we are going to use default values for most things.
#### Common Configuration
There are a couple of cluster wide configuration options we have to define. The common/cluster configuration files should exist under:
```
config/_common
```
In the directory, there should be a `common.runtime.properties` file with the following contents:
```
# Extensions
druid.extensions.loadList=["druid-examples","druid-kafka-eight"]
# Zookeeper
druid.zk.service.host=localhost
# Deep storage (local filesystem for examples - don't use this in production)
druid.storage.type=local
druid.storage.storageDirectory=/tmp/druid/localStorage
# Query Cache (we use a simple 10mb heap-based local cache on the broker)
druid.cache.type=local
druid.cache.sizeInBytes=10000000
# Indexing service discovery
druid.selectors.indexing.serviceName=overlord
# Monitoring (disabled for examples)
# druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor"]
# Metrics logging (disabled for examples)
druid.emitter=noop
```
In this file we define our external dependencies and cluster wide configs.
#### Start a Coordinator Node
Coordinator nodes are in charge of load assignment and distribution. Coordinator nodes monitor the status of the cluster and command historical nodes to assign and drop segments.
For more information about coordinator nodes, see [here](../design/coordinator.html).
The coordinator config file should already exist at:
```
config/coordinator
```
In the directory, there should be a `runtime.properties` file with the following contents:
```
druid.host=localhost
druid.port=8081
druid.service=druid/coordinator
# The coordinator begins assignment operations after the start delay.
# We override the default here to start things up faster for examples.
druid.coordinator.startDelay=PT70s
```
To start the coordinator node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath "config/_common:config/coordinator:lib/*" io.druid.cli.Main server coordinator
```
Note: we will be running a single historical node in these examples, so you may see some warnings about not being able to replicate segments. These can be safely ignored, but in production, you should always replicate segments across multiple historical nodes.
#### Start a Historical Node
Historical nodes are the workhorses of a cluster and are in charge of loading historical segments and making them available for queries. Realtime nodes hand off segments to historical nodes.
For more information about Historical nodes, see [here](../design/historical.html).
The historical config file should exist at:
```
config/historical
```
In the directory we just created, we should have the file `runtime.properties` with the following contents:
```
druid.host=localhost
druid.port=8083
druid.service=druid/historical
# We can only scan 1 segment in parallel with these configs.
# Our intermediate buffer is also very small so longer topNs will be slow.
druid.processing.buffer.sizeBytes=100000000
druid.processing.numThreads=1
druid.segmentCache.locations=[{"path": "/tmp/druid/indexCache", "maxSize"\: 10000000000}]
druid.server.maxSize=10000000000
```
To start the historical node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath "config/_common:config/historical:lib/*" io.druid.cli.Main server historical
```
#### Start a Broker Node
Broker nodes are responsible for figuring out which historical and/or realtime nodes correspond to which queries. They also merge partial results from these nodes in a scatter/gather fashion.
For more information about Broker nodes, see [here](../design/broker.html).
The broker config file should exist at:
```
config/broker
```
In the directory, there should be a `runtime.properties` file with the following contents:
```
druid.host=localhost
druid.port=8082
druid.service=druid/broker
druid.broker.cache.useCache=true
druid.broker.cache.populateCache=true
# Bump these up only for faster nested groupBy
druid.processing.buffer.sizeBytes=100000000
druid.processing.numThreads=1
```
To start the broker node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath "config/_common:config/broker:lib/*" io.druid.cli.Main server broker
```
#### Start a Realtime Node
Our goal is to ingest some data and hand-off that data to the rest of our Druid cluster. To accomplish this goal, we need to make some small configuration changes.
In your favorite editor, open up:
```
examples/wikipedia/wikipedia_realtime.spec
```
We need to change some configuration in order to force hand-off faster.
Let's change:
```
"segmentGranularity": "HOUR",
```
to
```
"segmentGranularity": "FIVE_MINUTE",
```
and
```
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT10m",
```
to
```
"intermediatePersistPeriod": "PT3m",
"windowPeriod": "PT1m",
```
Now we should be handing off segments every 6 minutes or so.
To start the realtime node that was used in our first tutorial, you simply have to issue:
```
java -Xmx512m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Ddruid.realtime.specFile=examples/wikipedia/wikipedia_realtime.spec -classpath "config/_common:config/realtime:lib/*" io.druid.cli.Main server realtime
```
The configurations are located in `config/realtime/runtime.properties` and should contain the following:
```
druid.host=localhost
druid.port=8084
druid.service=druid/realtime
# We can only 1 scan segment in parallel with these configs.
# Our intermediate buffer is also very small so longer topNs will be slow.
druid.processing.buffer.sizeBytes=100000000
druid.processing.numThreads=2
# Enable Real monitoring
# druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor","io.druid.segment.realtime.RealtimeMetricsMonitor"]
```
Once the real-time node starts up, it should begin ingesting data and handing that data off to the rest of the Druid cluster. You can use a web UI located at coordinator_ip:port to view the status of data being loaded. Once data is handed off from the real-time nodes to historical nodes, the historical nodes should begin serving segments.
#### Query
At any point during ingestion, we can query for data. For example:
```
curl -X POST 'http://localhost:8082/druid/v2/?pretty' -H 'content-type: application/json' -d@examples/wikipedia/query.body
```
This query will span across both realtime and historical nodes. If you're curious, you can query the historical node directly by sending the same query to the historical node's port:
```
curl -X POST 'http://localhost:8083/druid/v2/?pretty' -H 'content-type: application/json' -d@examples/wikipedia/query.body
```
This query may produce no results if the realtime node hasn't run long enough to hand off the segment (we configured five-minute segments with a one-minute window above, so hand-off happens roughly every 6 minutes). Query the realtime node directly by sending the same query to the realtime node's port:
```
curl -X POST 'http://localhost:8084/druid/v2/?pretty' -H 'content-type: application/json' -d@examples/wikipedia/query.body
```
The realtime query results will reflect the data that was recently indexed from wikipedia, and not handed off to the historical node yet. Once the historical node acknowledges it has loaded the segment, the realtime node will drop the segment.
Querying the historical and realtime node directly is useful for understanding how the segment handling is working, but if you just want to run a query for all the data (realtime and historical), then send the query to the broker at port 8082 (which is what we did in the first example). The broker will send the query to the historical and realtime nodes and merge the results.
For more information on querying, see this [link](../querying/querying.html).
Next Steps
----------
If you are interested in how data flows through the different Druid components, check out the [Druid data flow architecture](../design/design.html). Now that you have an understanding of what the Druid cluster looks like, why not load some of your own data?
Check out the next [tutorial](../tutorials/tutorial-loading-streaming-data.html) section for more info!
[
{
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"],
"dimensionExclusions" : [],
"spatialDimensions" : []
}
}
},
"metricsSpec" : [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "DAY",
"queryGranularity" : "NONE"
}
},
"ioConfig" : {
"type" : "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connection.timeout.ms" : "15000",
"zookeeper.session.timeout.ms" : "15000",
"zookeeper.sync.time.ms" : "5000",
"group.id": "druid-example",
"fetch.message.max.bytes" : "1048586",
"auto.offset.reset": "largest",
"auto.commit.enable": "false"
},
"feed": "wikipedia"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type" : "realtime",
"maxRowsInMemory": 500000,
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT10m",
"basePersistDirectory": "\/tmp\/realtime\/basePersist",
"rejectionPolicy": {
"type": "messageTime"
}
}
}
]
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "cancer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
\ No newline at end of file
{
"dataSchema": {
"dataSource": "wikipedia",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "timestamp",
"format": "auto"
},
"dimensionsSpec": {
"dimensions": [
"page",
"language",
"user",
"unpatrolled",
"newPage",
"robot",
"anonymous",
"namespace",
"continent",
"country",
"region",
"city"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{
"type": "count",
"name": "count"
},
{
"type": "doubleSum",
"name": "added",
"fieldName": "added"
},
{
"type": "doubleSum",
"name": "deleted",
"fieldName": "deleted"
},
{
"type": "doubleSum",
"name": "delta",
"fieldName": "delta"
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "DAY",
"queryGranularity": "NONE",
"intervals": ["2013-08-31/2013-09-01"]
}
},
"ioConfig": {
"type": "hadoop",
"inputSpec": {
"type": "static",
"paths": "/myPath/druid-<version>/examples/indexing/wikipedia_data.json"
},
"metadataUpdateSpec": {
"type": "db",
"connectURI": "jdbc:mysql:\/\/localhost:3306\/druid",
"user": "druid",
"password": "diurd",
"segmentTable": "druid_segments"
},
"segmentOutputPath": "\/tmp\/segments"
},
"tuningConfig": {
"type": "hadoop",
"workingPath": "\/tmp\/working_path",
"partitionsSpec": {
"targetPartitionSize": 5000000
}
}
}
{
"type": "index_hadoop",
"spec": {
"dataSchema": {
"dataSource": "wikipedia",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "timestamp",
"format": "auto"
},
"dimensionsSpec": {
"dimensions": [
"page",
"language",
"user",
"unpatrolled",
"newPage",
"robot",
"anonymous",
"namespace",
"continent",
"country",
"region",
"city"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{
"type": "count",
"name": "count"
},
{
"type": "doubleSum",
"name": "added",
"fieldName": "added"
},
{
"type": "doubleSum",
"name": "deleted",
"fieldName": "deleted"
},
{
"type": "doubleSum",
"name": "delta",
"fieldName": "delta"
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "DAY",
"queryGranularity": "NONE",
"intervals": ["2013-08-31/2013-09-01"]
}
},
"ioConfig": {
"type": "hadoop",
"inputSpec": {
"type": "static",
"paths": "examples/indexing/wikipedia_data.json"
}
},
"tuningConfig": {
"type": "hadoop",
"partitionsSpec": {
"targetPartitionSize": 5000000
}
}
}
}
#!/usr/bin/env bash
# Removes ./twitter4j.properties, but only when a "created" marker file is
# present in the current directory; the marker is removed as well.
set +u
shopt -s xpg_echo
shopt -s expand_aliases

PF=./twitter4j.properties

# No marker means twitter4j.properties already existed beforehand: the user is
# fine with keeping the twitter password in that file, so it is left untouched.
if test -e "created"; then
  for stale in "${PF}" created; do
    rm -f "${stale}"
  done
fi
#!/usr/bin/env bash
# Makes sure ./twitter4j.properties exists. When it is missing, the user is
# prompted for the four Twitter OAuth values, which are written to a freshly
# created properties file, and a "created" marker file is dropped next to it
# to record that this script (not the user) created the properties file.
set +u
shopt -s xpg_echo
shopt -s expand_aliases

PF=./twitter4j.properties

# if twitter4j.properties already exists, then user is okay with having twitter pw in file.
# Otherwise a twitter4j.properties file in curr. dir. is made temporarily for twitter login.
if [ ! -e "$PF" ]; then
  # On HUP/INT/QUIT/TERM remove BOTH the temporary credentials file and the
  # "created" marker. Leaving the marker behind would record a properties file
  # as script-created even after it is gone, which could mislead later runs.
  # -f keeps the handler quiet if a file was never created.
  PF_CLEANUP="/bin/rm -f $PF created"
  trap "${PF_CLEANUP} ; exit 1" 1 2 3 15

  touch created
  touch "$PF"
  # Credentials file: owner read/write only; it never needs to be executable.
  chmod 600 "$PF"

  echo " Your twitter OAuth information is needed. Go to https://twitter.com/oauth_clients/new to register a new application and retrieve your keys "

  # -r stops read from treating backslashes in the pasted values as escapes.
  read -r -p 'Twitter consumer key? ' CONSUMER_KEY
  read -r -p 'Twitter consumer secret? ' CONSUMER_SECRET
  read -r -p 'Twitter access token? ' ACCESS_TOKEN
  read -r -p 'Twitter access token secret? ' ACCESS_TOKEN_SECRET

  # printf '%s' writes the values verbatim; echo under xpg_echo would expand
  # backslash sequences inside the keys and secrets.
  {
    printf '%s\n' "debug=true"
    printf 'oauth.consumerKey=%s\n' "${CONSUMER_KEY}"
    printf 'oauth.consumerSecret=%s\n' "${CONSUMER_SECRET}"
    printf 'oauth.accessToken=%s\n' "${ACCESS_TOKEN}"
    printf 'oauth.accessTokenSecret=%s\n' "${ACCESS_TOKEN_SECRET}"
  } >> "$PF"

  # Blank the secrets so they do not linger in shell variables.
  CONSUMER_SECRET=""
  ACCESS_TOKEN_SECRET=""
fi
\ No newline at end of file
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册