html strip character filter
edithtml strip character filter
Strips HTML elements from a text and replaces HTML entities with their decoded
value (e.g., replaces `&amp;` with `&`).
The html_strip filter uses Lucene’s
HTMLStripCharFilter.
Example
The following analyze API request uses the
html_strip filter to change the text `<p>I'm so <b>happy</b>!</p>` to
`\nI'm so happy!\n`.
resp = client.indices.analyze(
tokenizer="keyword",
char_filter=[
"html_strip"
],
text="<p>I'm so <b>happy</b>!</p>",
)
print(resp)
response = client.indices.analyze(
body: {
tokenizer: 'keyword',
char_filter: [
'html_strip'
],
text: "<p>I'm so <b>happy</b>!</p>"
}
)
puts response
const response = await client.indices.analyze({
tokenizer: "keyword",
char_filter: ["html_strip"],
text: "<p>I'm so <b>happy</b>!</p>",
});
console.log(response);
GET /_analyze
{
"tokenizer": "keyword",
"char_filter": [
"html_strip"
],
"text": "<p>I'm so <b>happy</b>!</p>"
}
The filter produces the following text:
[ \nI'm so happy!\n ]
Add to an analyzer
The following create index API request uses the
html_strip filter to configure a new
custom analyzer.
resp = client.indices.create(
index="my-index-000001",
settings={
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "keyword",
"char_filter": [
"html_strip"
]
}
}
}
},
)
print(resp)
response = client.indices.create(
index: 'my-index-000001',
body: {
settings: {
analysis: {
analyzer: {
my_analyzer: {
tokenizer: 'keyword',
char_filter: [
'html_strip'
]
}
}
}
}
}
)
puts response
const response = await client.indices.create({
index: "my-index-000001",
settings: {
analysis: {
analyzer: {
my_analyzer: {
tokenizer: "keyword",
char_filter: ["html_strip"],
},
},
},
},
});
console.log(response);
PUT /my-index-000001
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "keyword",
"char_filter": [
"html_strip"
]
}
}
}
}
}
Configurable parameters
`escaped_tags`
(Optional, array of strings)
Array of HTML elements without enclosing angle brackets (`< >`). The filter
skips these HTML elements when stripping HTML from the text. For example, a
value of `[ "p" ]` skips the `<p>` HTML element.
Customize
To customize the html_strip filter, duplicate it to create the basis for a new
custom character filter. You can modify the filter using its configurable
parameters.
The following create index API request
configures a new custom analyzer using a custom
html_strip filter, my_custom_html_strip_char_filter.
The my_custom_html_strip_char_filter filter skips the removal of the `<b>`
HTML element.
resp = client.indices.create(
index="my-index-000001",
settings={
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "keyword",
"char_filter": [
"my_custom_html_strip_char_filter"
]
}
},
"char_filter": {
"my_custom_html_strip_char_filter": {
"type": "html_strip",
"escaped_tags": [
"b"
]
}
}
}
},
)
print(resp)
response = client.indices.create(
index: 'my-index-000001',
body: {
settings: {
analysis: {
analyzer: {
my_analyzer: {
tokenizer: 'keyword',
char_filter: [
'my_custom_html_strip_char_filter'
]
}
},
char_filter: {
my_custom_html_strip_char_filter: {
type: 'html_strip',
escaped_tags: [
'b'
]
}
}
}
}
}
)
puts response
const response = await client.indices.create({
index: "my-index-000001",
settings: {
analysis: {
analyzer: {
my_analyzer: {
tokenizer: "keyword",
char_filter: ["my_custom_html_strip_char_filter"],
},
},
char_filter: {
my_custom_html_strip_char_filter: {
type: "html_strip",
escaped_tags: ["b"],
},
},
},
},
});
console.log(response);
PUT my-index-000001
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "keyword",
"char_filter": [
"my_custom_html_strip_char_filter"
]
}
},
"char_filter": {
"my_custom_html_strip_char_filter": {
"type": "html_strip",
"escaped_tags": [
"b"
]
}
}
}
}
}