The older name delimited_payload_filter
is deprecated and should not be used
with new indices. Use delimited_payload
instead.
Separates a token stream into tokens and payloads based on a specified delimiter.
For example, you can use the delimited_payload
filter with a |
delimiter to
split the|1 quick|2 fox|3
into the tokens the
, quick
, and fox
with respective payloads of 1
, 2
, and 3
.
This filter uses Lucene’s DelimitedPayloadTokenFilter.
Payloads
A payload is user-defined binary data associated with a token position and stored as base64-encoded bytes.
Elasticsearch does not store token payloads by default. To store payloads, you must:
-
Set the
term_vector
mapping parameter to with_positions_payloads
or with_positions_offsets_payloads
for any field storing payloads. -
Use an index analyzer that includes the
delimited_payload
filter
You can view stored payloads using the term vectors API.
The following analyze API request uses the
delimited_payload
filter with the default |
delimiter to split
the|0 brown|10 fox|5 is|0 quick|10
into tokens and payloads.
resp = client.indices.analyze( tokenizer="whitespace", filter=[ "delimited_payload" ], text="the|0 brown|10 fox|5 is|0 quick|10", ) print(resp)
response = client.indices.analyze( body: { tokenizer: 'whitespace', filter: [ 'delimited_payload' ], text: 'the|0 brown|10 fox|5 is|0 quick|10' } ) puts response
const response = await client.indices.analyze({ tokenizer: "whitespace", filter: ["delimited_payload"], text: "the|0 brown|10 fox|5 is|0 quick|10", }); console.log(response);
GET _analyze { "tokenizer": "whitespace", "filter": ["delimited_payload"], "text": "the|0 brown|10 fox|5 is|0 quick|10" }
The filter produces the following tokens:
[ the, brown, fox, is, quick ]
Note that the analyze API does not return stored payloads. For an example that includes returned payloads, see Return stored payloads.
The following create index API request uses the
delimited_payload
filter to configure a new custom
analyzer.
resp = client.indices.create( index="delimited_payload", settings={ "analysis": { "analyzer": { "whitespace_delimited_payload": { "tokenizer": "whitespace", "filter": [ "delimited_payload" ] } } } }, ) print(resp)
response = client.indices.create( index: 'delimited_payload', body: { settings: { analysis: { analyzer: { whitespace_delimited_payload: { tokenizer: 'whitespace', filter: [ 'delimited_payload' ] } } } } } ) puts response
const response = await client.indices.create({ index: "delimited_payload", settings: { analysis: { analyzer: { whitespace_delimited_payload: { tokenizer: "whitespace", filter: ["delimited_payload"], }, }, }, }, }); console.log(response);
PUT delimited_payload { "settings": { "analysis": { "analyzer": { "whitespace_delimited_payload": { "tokenizer": "whitespace", "filter": [ "delimited_payload" ] } } } } }
-
delimiter
-
(Optional, string)
Character used to separate tokens from payloads. Defaults to
|
.
-
encoding
-
(Optional, string) Data type for the stored payload. Valid values are:
-
float
- (Default) Float
-
identity
- Characters
-
int
- Integer
-
To customize the delimited_payload
filter, duplicate it to create the basis
for a new custom token filter. You can modify the filter using its configurable
parameters.
For example, the following create index API request
uses a custom delimited_payload
filter to configure a new
custom analyzer. The custom delimited_payload
filter uses the +
delimiter to separate tokens from payloads. Payloads are
encoded as integers.
resp = client.indices.create( index="delimited_payload_example", settings={ "analysis": { "analyzer": { "whitespace_plus_delimited": { "tokenizer": "whitespace", "filter": [ "plus_delimited" ] } }, "filter": { "plus_delimited": { "type": "delimited_payload", "delimiter": "+", "encoding": "int" } } } }, ) print(resp)
response = client.indices.create( index: 'delimited_payload_example', body: { settings: { analysis: { analyzer: { whitespace_plus_delimited: { tokenizer: 'whitespace', filter: [ 'plus_delimited' ] } }, filter: { plus_delimited: { type: 'delimited_payload', delimiter: '+', encoding: 'int' } } } } } ) puts response
const response = await client.indices.create({ index: "delimited_payload_example", settings: { analysis: { analyzer: { whitespace_plus_delimited: { tokenizer: "whitespace", filter: ["plus_delimited"], }, }, filter: { plus_delimited: { type: "delimited_payload", delimiter: "+", encoding: "int", }, }, }, }, }); console.log(response);
PUT delimited_payload_example { "settings": { "analysis": { "analyzer": { "whitespace_plus_delimited": { "tokenizer": "whitespace", "filter": [ "plus_delimited" ] } }, "filter": { "plus_delimited": { "type": "delimited_payload", "delimiter": "+", "encoding": "int" } } } } }
Use the create index API to create an index that:
- Includes a field that stores term vectors with payloads.
-
Uses a custom index analyzer with the
delimited_payload
filter.
resp = client.indices.create( index="text_payloads", mappings={ "properties": { "text": { "type": "text", "term_vector": "with_positions_payloads", "analyzer": "payload_delimiter" } } }, settings={ "analysis": { "analyzer": { "payload_delimiter": { "tokenizer": "whitespace", "filter": [ "delimited_payload" ] } } } }, ) print(resp)
response = client.indices.create( index: 'text_payloads', body: { mappings: { properties: { text: { type: 'text', term_vector: 'with_positions_payloads', analyzer: 'payload_delimiter' } } }, settings: { analysis: { analyzer: { payload_delimiter: { tokenizer: 'whitespace', filter: [ 'delimited_payload' ] } } } } } ) puts response
const response = await client.indices.create({ index: "text_payloads", mappings: { properties: { text: { type: "text", term_vector: "with_positions_payloads", analyzer: "payload_delimiter", }, }, }, settings: { analysis: { analyzer: { payload_delimiter: { tokenizer: "whitespace", filter: ["delimited_payload"], }, }, }, }, }); console.log(response);
PUT text_payloads { "mappings": { "properties": { "text": { "type": "text", "term_vector": "with_positions_payloads", "analyzer": "payload_delimiter" } } }, "settings": { "analysis": { "analyzer": { "payload_delimiter": { "tokenizer": "whitespace", "filter": [ "delimited_payload" ] } } } } }
Add a document containing payloads to the index.
resp = client.index( index="text_payloads", id="1", document={ "text": "the|0 brown|3 fox|4 is|0 quick|10" }, ) print(resp)
response = client.index( index: 'text_payloads', id: 1, body: { text: 'the|0 brown|3 fox|4 is|0 quick|10' } ) puts response
const response = await client.index({ index: "text_payloads", id: 1, document: { text: "the|0 brown|3 fox|4 is|0 quick|10", }, }); console.log(response);
POST text_payloads/_doc/1 { "text": "the|0 brown|3 fox|4 is|0 quick|10" }
Use the term vectors API to return the document’s tokens and base64-encoded payloads.
resp = client.termvectors( index="text_payloads", id="1", fields=[ "text" ], payloads=True, ) print(resp)
response = client.termvectors( index: 'text_payloads', id: 1, body: { fields: [ 'text' ], payloads: true } ) puts response
const response = await client.termvectors({ index: "text_payloads", id: 1, fields: ["text"], payloads: true, }); console.log(response);
GET text_payloads/_termvectors/1 { "fields": [ "text" ], "payloads": true }
The API returns the following response:
{ "_index": "text_payloads", "_id": "1", "_version": 1, "found": true, "took": 8, "term_vectors": { "text": { "field_statistics": { "sum_doc_freq": 5, "doc_count": 1, "sum_ttf": 5 }, "terms": { "brown": { "term_freq": 1, "tokens": [ { "position": 1, "payload": "QEAAAA==" } ] }, "fox": { "term_freq": 1, "tokens": [ { "position": 2, "payload": "QIAAAA==" } ] }, "is": { "term_freq": 1, "tokens": [ { "position": 3, "payload": "AAAAAA==" } ] }, "quick": { "term_freq": 1, "tokens": [ { "position": 4, "payload": "QSAAAA==" } ] }, "the": { "term_freq": 1, "tokens": [ { "position": 0, "payload": "AAAAAA==" } ] } } } } }