PUT test{"settings": {"index": {"number_of_shards": "1","number_of_replicas": "0"}},"mappings": {"doc": { // ES 7.x and later do not require type to be specified when creating an index. There will be a default and unique `_doc` type"properties": {"k1": {"type": "long"},"k2": {"type": "date"},"k3": {"type": "keyword"},"k4": {"type": "text","analyzer": "standard"},"k5": {"type": "float"}}}}}
OST /_bulk{"index":{"_index":"test","_type":"doc"}}{ "k1" : 100, "k2": "2020-01-01", "k3": "Trying out Elasticsearch", "k4": "Trying out Elasticsearch", "k5": 10.0}{"index":{"_index":"test","_type":"doc"}}{ "k1" : 100, "k2": "2020-01-01", "k3": "Trying out Doris", "k4": "Trying out Doris", "k5": 10.0}{"index":{"_index":"test","_type":"doc"}}{ "k1" : 100, "k2": "2020-01-01", "k3": "Doris On ES", "k4": "Doris On ES", "k5": 10.0}{"index":{"_index":"test","_type":"doc"}}{ "k1" : 100, "k2": "2020-01-01", "k3": "Doris", "k4": "Doris", "k5": 10.0}{"index":{"_index":"test","_type":"doc"}}{ "k1" : 100, "k2": "2020-01-01", "k3": "ES", "k4": "ES", "k5": 10.0}
CREATE EXTERNAL TABLE `test` // schema is not specified. Automatically ES mapping is pulled for table creationENGINE=ELASTICSEARCHPROPERTIES ("hosts" = "http://192.168.0.1:8200,http://192.168.0.2:8200","index" = "test","type" = "doc","user" = "root","password" = "root");CREATE EXTERNAL TABLE `test` (`k1` bigint(20) COMMENT "",`k2` datetime COMMENT "",`k3` varchar(20) COMMENT "",`k4` varchar(100) COMMENT "",`k5` float COMMENT "") ENGINE=ELASTICSEARCH // The ENGINE must be ElasticsearchPROPERTIES ("hosts" = "http://192.168.0.1:8200,http://192.168.0.2:8200","index" = "test","type" = "doc","user" = "root","password" = "root");
Parameter | Description |
hosts | ES cluster address, can be one or multiple addresses, or the load balancer address of ES |
index | The index name of the corresponding ES, supporting alias. If using doc_value, you need to use the real name. |
type | The index type. This parameter is not required to import for ES 7.x and later versions. |
user | ES cluster username |
password | Cipher information for the application |
Doris On ES
is the pushdown of filter conditions: the conditions are pushed down to ES, so only data that truly meets the criteria is returned, significantly improving query performance and reducing the CPU, memory, and IO usage of both Doris and Elasticsearch.enable_new_es_dsl
indicates whether to use the new DSL generation logic. All subsequent bug fixes and iterations are developed in the new DSL. The value is true
by default, and can be modified in fe.conf
.SQL syntax | ES 5.x+ syntax |
= | term query |
in | terms query |
> , < , >= , ⇐ | range query |
and | bool.filter |
or | bool.should |
not | bool.must_not |
not in | bool.must_not + terms query |
is_not_null | exists query |
is_null | bool.must_not + exists query |
esquery | ES Native json format QueryDSL |
Doris/ES | byte | short | integer | long | float | double | keyword | text | date |
tinyint | ✓ | - | - | - | - | - | - | - | - |
smallint | ✓ | ✓ | - | - | - | - | - | - | - |
int | ✓ | ✓ | ✓ | - | - | - | - | - | - |
bigint | ✓ | ✓ | ✓ | ✓ | - | - | - | - | - |
float | - | - | - | - | ✓ | - | - | - | - |
double | - | - | - | - | - | ✓ | - | - | - |
char | - | - | - | - | - | - | ✓ | ✓ | - |
varchar | - | - | - | - | - | - | ✓ | ✓ | - |
date | - | - | - | - | - | - | - | - | ✓ |
datetime | - | - | - | - | - | - | - | - | ✓ |
CREATE EXTERNAL TABLE `test` (`k1` bigint(20) COMMENT "",`k2` datetime COMMENT "",`k3` varchar(20) COMMENT "",`k4` varchar(100) COMMENT "",`k5` float COMMENT "") ENGINE=ELASTICSEARCHPROPERTIES ("hosts" = "http://192.168.0.1:8200,http://192.168.0.2:8200","index" = "test","user" = "root","password" = "root","enable_docvalue_scan" = "true");
Parameter | Description |
enable_docvalue_scan | Whether to enable the function of obtaining the value of queried field through ES/Lucene columnar storage. The default value is false. |
_source
._source
. The _source
is in row storage and JSON format. Compared to columnar storage, it is slow in batch read. In particular, when the system only needs to read small number of columns, the performance of docvalue can be about a dozen times faster than that of _source
.text
type fields in ES do not have columnar storage, so if the field value that needs to be accessed is of text
type, it will automatically downgrade to access from _source
.> = 25
), the performance of accessing field values from docvalue
will be basically the same as accessing field values from _source
.CREATE EXTERNAL TABLE `test` (`k1` bigint(20) COMMENT "",`k2` datetime COMMENT "",`k3` varchar(20) COMMENT "",`k4` varchar(100) COMMENT "",`k5` float COMMENT "") ENGINE=ELASTICSEARCHPROPERTIES ("hosts" = "http://192.168.0.1:8200,http://192.168.0.2:8200","index" = "test","user" = "root","password" = "root","enable_keyword_sniff" = "true");
Parameter | Description |
enable_keyword_sniff | Whether to sniff string type tokenization type (text) fields in ES, and obtain extra non-tokenized (keyword) field name (multi-fields mechanism) |
text
and keyword
types. This is how the multi-fields feature of ES works. The mapping is as follows:"k4": {"type": "text","fields": {"keyword": {"type": "keyword","ignore_above": 256}}}
k4 = "Doris On ES"
"term" : {"k4": "Doris On ES"}
text
, it will be tokenized by the analyzer set for k4 (or by the standard analyzer if no analyzer has been set for k4) after data ingestion. As a result, it will be tokenized into three terms: Doris, On, and ES. The details are as follows:POST /_analyze{"analyzer": "standard","text": "Doris On ES"}
{"tokens": [{"token": "doris","start_offset": 0,"end_offset": 5,"type": "<ALPHANUM>","position": 0},{"token": "on","start_offset": 6,"end_offset": 8,"type": "<ALPHANUM>","position": 1},{"token": "es","start_offset": 9,"end_offset": 11,"type": "<ALPHANUM>","position": 2}]}
"term" : {"k4": "Doris On ES"}
Doris On ES
this term does not match any term in the dictionary, will not return any results, while enabling enable_keyword_sniff: true
will automatically convert k4 = "Doris On ES"
to k4.keyword = "Doris On ES"
to fully match SQL semantics, the converted ES query DSL is:"term" : {"k4.keyword": "Doris On ES"}
k4.keyword
is of type keyword
, and when data is written into ES, it is a complete term, so it can match.CREATE EXTERNAL TABLE `test` (`k1` bigint(20) COMMENT "",`k2` datetime COMMENT "",`k3` varchar(20) COMMENT "",`k4` varchar(100) COMMENT "",`k5` float COMMENT "") ENGINE=ELASTICSEARCHPROPERTIES ("hosts" = "http://192.168.0.1:8200,http://192.168.0.2:8200","index" = "test","user" = "root","password" = "root","nodes_discovery" = "true");
Parameter | Description |
nodes_discovery | Whether to start EST node discovery. The default value is true. |
true
. The default value is false(http_ssl_enabled=true).CREATE EXTERNAL TABLE `test` (`k1` bigint(20) COMMENT "",`k2` datetime COMMENT "",`k3` varchar(20) COMMENT "",`k4` varchar(100) COMMENT "",`k5` float COMMENT "") ENGINE=ELASTICSEARCHPROPERTIES ("hosts" = "http://192.168.0.1:8200,http://192.168.0.2:8200","index" = "test","user" = "root","password" = "root","http_ssl_enabled" = "true");
Parameter | Description |
http_ssl_enabled | Is HTTPS access pattern enabled for the ES cluster |
select * from es_table where k1 > 1000 and k3 ='term' or k4 like 'fu*z_'
esquery(field, QueryDSL)
function can be used to pushe queries that cannot be expressed in SQL, such as match_phrase, and geoshape, etc., to ES for filtering. The first parameter ofesquery
is used to associate withindex
, while the second parameter is the JSON expression of basicQuery DSL
in ES, enclosed by{}
. Theroot key
in JSON is unique, which can be match_phrase, geo_shape, or bool, etc.
A match_phrase query:select * from es_table where esquery(k4, ' {"match_phrase": {"k4": "doris on es"}}');
select * from es_table where esquery(k4, ' {"geo_shape": {"location": {"shape": {"type": "envelope","coordinates": [[13,53],[14,52]]},"relation": "within"}}}');
select * from es_table where esquery(k4, ' {"bool": {"must": [{"terms": {"k1": [11,12]}},{"terms": {"k2": [100]}}]}}');
+----------------------------------------------+| || Doris +------------------+ || | FE +--------------+-------+| | | Request Shard Location| +--+-------------+-+ | || ^ ^ | || | | | || +-------------------+ +------------------+ | || | | | | | | | || | +----------+----+ | | +--+-----------+ | | || | | BE | | | | BE | | | || | +---------------+ | | +--------------+ | | |+----------------------------------------------+ || | | | | | || | | | | | || HTTP SCROLL | | HTTP SCROLL | |+-----------+---------------------+------------+ || | v | | v | | || | +------+--------+ | | +------+-------+ | | || | | | | | | | | | || | | DataNode | | | | DataNode +<-----------+| | | | | | | | | | || | | +<--------------------------------+| | +---------------+ | | |--------------| | | || +-------------------+ +------------------+ | || Same Physical Node | || | || +-----------------------+ | || | | | || | MasterNode +<-----------------+| ES | | || +-----------------------+ |+----------------------------------------------+
_source
or docvalue
from each tablet of ES index concurrently by way of HTTP Scroll
."dt": {"type": "date","format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"}
date
or datetime
(or varchar
). You can use the following SQL statements to push the filter conditions down to ES:select * from doe where k2 > '2020-06-21';select * from doe where k2 < '2020-06-21 12:00:00';select * from doe where k2 < 1593497011;select * from doe where k2 < now();select * from doe where k2 < date_format(now(), '%Y-%m-%d');
format
of the time field in ES is not set, the default time field format is:strict_date_optional_time||epoch_millis
ms
. ms
is the internal processing format in ES. Otherwise, errors will occur in ES external tables._id
_id
, will be given a globally unique _id
, which is the primary key. Users can assign an _id
with unique business meaning to the files during importing. To obtain such field values from Doris On ES, you can add a varchar
-typed _id
field when creating tables.CREATE EXTERNAL TABLE `doe` (`_id` varchar COMMENT "",`city` varchar COMMENT "") ENGINE=ELASTICSEARCHPROPERTIES ("hosts" = "http://127.0.0.1:8200","user" = "root","password" = "root","index" = "doe"}
_id
field only supports =
and in
filtering._id
field must be of varchar
type.
Was this page helpful?