Web Scraper + Elasticsearch + Kibana + SearchKit 打造的豆瓣电影top250 搜索演示系统
Elasticsearch | 作者 森 | 发布于2023年04月09日 | 阅读数:5525
Web Scraper + Elasticsearch + Kibana + SearchKit 打造的豆瓣电影top250 搜索演示系统
作者:小森同学
声明:电影数据来源于“豆瓣电影”,如有侵权,请联系删除
Web Scraper
{
  "_id": "top250",
  "startUrl": ["https://movie.douban.com/top250?start=[0-225:25]&filter="],
  "selectors": [
    {
      "id": "container",
      "multiple": true,
      "parentSelectors": ["_root"],
      "selector": ".grid_view li",
      "type": "SelectorElement"
    },
    {
      "id": "name",
      "multiple": false,
      "parentSelectors": ["container"],
      "regex": "",
      "selector": "span.title:nth-of-type(1)",
      "type": "SelectorText"
    },
    {
      "id": "number",
      "multiple": false,
      "parentSelectors": ["container"],
      "regex": "",
      "selector": "em",
      "type": "SelectorText"
    },
    {
      "id": "score",
      "multiple": false,
      "parentSelectors": ["container"],
      "regex": "",
      "selector": "span.rating_num",
      "type": "SelectorText"
    },
    {
      "id": "review",
      "multiple": false,
      "parentSelectors": ["container"],
      "regex": "",
      "selector": "span.inq",
      "type": "SelectorText"
    },
    {
      "id": "year",
      "multiple": false,
      "parentSelectors": ["container"],
      "regex": "\\d{4}",
      "selector": "p:nth-of-type(1)",
      "type": "SelectorText"
    },
    {
      "id": "tour_guide",
      "multiple": false,
      "parentSelectors": ["container"],
      "regex": "^导演: \\S*",
      "selector": "p:nth-of-type(1)",
      "type": "SelectorText"
    },
    {
      "id": "type",
      "multiple": false,
      "parentSelectors": ["container"],
      "regex": "[^/]+$",
      "selector": "p:nth-of-type(1)",
      "type": "SelectorText"
    },
    {
      "id": "area",
      "multiple": false,
      "parentSelectors": ["container"],
      "regex": "[^\\/]+(?=\\/[^\\/]*$)",
      "selector": "p:nth-of-type(1)",
      "type": "SelectorText"
    },
    {
      "id": "detail_link",
      "multiple": false,
      "parentSelectors": ["container"],
      "selector": ".hd a",
      "type": "SelectorLink"
    },
    {
      "id": "director",
      "multiple": false,
      "parentSelectors": ["detail_link"],
      "regex": "",
      "selector": "span:nth-of-type(1) .attrs a",
      "type": "SelectorText"
    },
    {
      "id": "screenwriter",
      "multiple": false,
      "parentSelectors": ["detail_link"],
      "regex": "(?<=编剧: )[\\u4e00-\\u9fa5A-Za-z0-9/()\\·\\s]+(?=主演)",
      "selector": "div#info",
      "type": "SelectorText"
    },
    {
      "id": "film_length",
      "multiple": false,
      "parentSelectors": ["detail_link"],
      "regex": "\\d+",
      "selector": "span[property='v:runtime']",
      "type": "SelectorText"
    },
    {
      "id": "IMDb",
      "multiple": false,
      "parentSelectors": ["detail_link"],
      "regex": "(?<=IMDb:\\s*)\\S+",
      "selector": "div#info",
      "type": "SelectorText"
    },
    {
      "id": "language",
      "multiple": false,
      "parentSelectors": ["detail_link"],
      "regex": "(?<=语言: )\\S+",
      "selector": "div#info",
      "type": "SelectorText"
    },
    {
      "id": "alias",
      "multiple": false,
      "parentSelectors": ["detail_link"],
      "regex": "(?<=又名: )[\\u4e00-\\u9fa5A-Za-z0-9/()\\s]+(?=IMDb)",
      "selector": "div#info",
      "type": "SelectorText"
    },
    {
      "id": "pic",
      "multiple": false,
      "parentSelectors": ["container"],
      "selector": "img",
      "type": "SelectorImage"
    }
  ]
}
elasticsearch
{
  "mappings": {
    "properties": {
      "IMDb": {
        "type": "keyword",
        "copy_to": ["all"]
      },
      "alias": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart",
        "copy_to": ["all"],
        "fields": {
          "keyword": { "type": "keyword", "ignore_above": 256 }
        }
      },
      "all": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      },
      "area": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart",
        "copy_to": ["all"],
        "fields": {
          "keyword": { "type": "keyword", "ignore_above": 256 }
        }
      },
      "director": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart",
        "copy_to": ["all"],
        "fields": {
          "keyword": { "type": "keyword", "ignore_above": 256 }
        }
      },
      "film_length": { "type": "long" },
      "id": { "type": "keyword" },
      "language": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart",
        "copy_to": ["all"],
        "fields": {
          "keyword": { "type": "keyword", "ignore_above": 256 }
        }
      },
      "link": { "type": "keyword" },
      "name": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart",
        "copy_to": ["all"],
        "fields": {
          "keyword": { "type": "keyword", "ignore_above": 256 }
        }
      },
      "number": { "type": "long" },
      "photo": { "type": "keyword" },
      "review": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart",
        "copy_to": ["all"],
        "fields": {
          "keyword": { "type": "keyword", "ignore_above": 256 }
        }
      },
      "score": { "type": "double" },
      "screenwriter": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart",
        "copy_to": ["all"],
        "fields": {
          "keyword": { "type": "keyword", "ignore_above": 256 }
        }
      },
      "type": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart",
        "copy_to": ["all"],
        "fields": {
          "keyword": { "type": "keyword", "ignore_above": 256 }
        }
      },
      "year": { "type": "long" }
    }
  }
}
kibana
需要使用 ingest pipeline 对索引字段进行预处理,例如将 type 字段按空格分割为数组等,具体做法可以参照官方文档或其他博客。
仪表板的制作过程此处省略,请自行搜索相关教程。
SearchKit
[尊重社区原创,转载请保留或注明出处]
本文地址:http://searchkit.cn/article/14863