当前位置：首页 > backend >正文

【lucene】使用docvalues的案例

backend 2025/8/1 12:27:55

下面给出一段可直接跑通的 Lucene 8.5.0 示例代码，演示如何

1. 建索引时为两个字段启用 DocValues（一个 `NumericDocValues`，一个通过 `SortedSetDocValuesFacetField` 写入的 `SortedSetDocValues`）；

2. 用 `IndexSearcher` 按 DocValues 排序；

3. 用 `FacetsCollector` 做分组统计（相当于 SQL 的 `GROUP BY`）。

```java

import org.apache.lucene.analysis.standard.StandardAnalyzer;
// NOTE(review): Lucene50Codec does not appear to ship in lucene-core 8.5.0 and the demo
// does not need a custom codec; drop this import if it fails to resolve — TODO confirm.
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.document.*;
import org.apache.lucene.facet.*;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.NumericUtils;

public class DocValuesDemo {

public static void main(String[] args) throws Exception {

/* ---------- 1. 创建内存目录 ---------- */

Directory dir = new ByteBuffersDirectory();

IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());

// 使用默认 codec 即可，DocValues 默认开启

cfg.setCodec(new Lucene50Codec());

IndexWriter writer = new IndexWriter(dir, cfg);

/* ---------- 2. 写入三条示范文档 ---------- */

Document doc1 = new Document();

doc1.add(new StringField("id", "1", Field.Store.YES));

// 商品价：NumericDocValues，可排序、可聚合

doc1.add(new NumericDocValuesField("price", 2999));

// 商品品牌：SortedDocValues，可做 faceting

doc1.add(new SortedSetDocValuesFacetField("brand", "小米"));

Document doc2 = new Document();

doc2.add(new StringField("id", "2", Field.Store.YES));

doc2.add(new NumericDocValuesField("price", 3999));

doc2.add(new SortedSetDocValuesFacetField("brand", "苹果"));

Document doc3 = new Document();

doc3.add(new StringField("id", "3", Field.Store.YES));

doc3.add(new NumericDocValuesField("price", 1999));

doc3.add(new SortedSetDocValuesFacetField("brand", "小米"));

writer.addDocument(doc1);

writer.addDocument(doc2);

writer.addDocument(doc3);

writer.commit();

writer.close();

/* ---------- 3. 打开搜索器 ---------- */

DirectoryReader reader = DirectoryReader.open(dir);

IndexSearcher searcher = new IndexSearcher(reader);

/* ---------- 4. 按价格升序排序 ---------- */

Sort sort = new Sort(new SortField("price", SortField.Type.LONG));

TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 10, sort);

for (ScoreDoc sd : topDocs.scoreDocs) {

Document d = searcher.doc(sd.doc);

System.out.println("id=" + d.get("id"));

}

// 输出：

// id=3 (1999)

// id=1 (2999)

// id=2 (3999)

/* ---------- 5. 用 Facets 统计品牌分布 ---------- */

FacetsCollector fc = new FacetsCollector();

searcher.search(new MatchAllDocsQuery(), fc);

Facets facets = new SortedSetDocValuesFacetCounts(fc);

FacetResult result = facets.getTopChildren(10, "brand");

for (LabelAndValue lav : result.labelValues) {

System.out.println(lav.label + " : " + lav.value);

}

// 输出：

// 小米 : 2

// 苹果 : 1

reader.close();

dir.close();

}

```

---

关键 API 回顾

| 目标 | 用到的 Lucene 类/方法 |
| --- | --- |
| 声明数值 DocValues | `NumericDocValuesField` |
| 声明字符串 DocValues | `SortedSetDocValuesFacetField` |
| 排序 | `SortField("price", Type.LONG)` |
| 分组/聚合 | `FacetsCollector + SortedSetDocValuesFacetCounts` |

这样就完整展示了在 Lucene 8.5.0 中如何启用、读取并利用 DocValues 进行排序和聚合的完整链路。

下面给出一份可直接拷贝到 IDE 运行的 Lucene 8.5.0 完整示例，演示如何

1. 索引阶段：
- 用 NumericDocValues 存价格（可排序/聚合）
- 用 SortedSetDocValuesFacetField 存品牌（可做分面统计）

2. 查询阶段：
- 按价格排序
- 统计品牌分布
- 钻取（drill-down）过滤后再统计

---

✅ Maven 依赖（lucene-core + lucene-facet）

```xml
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>8.5.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-facet</artifactId>
<version>8.5.0</version>
</dependency>
```

---

✅ 代码示例

```java
package demo;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.facet.*;
import org.apache.lucene.facet.sortedset.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

import java.util.List;

/**
 * Lucene 8.5.0 demo: sorts by a numeric DocValues field, counts a SortedSet
 * facet dimension, then drills down on one facet value and counts again.
 */
public class DocValuesDemo {

    /**
     * Indexes three products in memory and prints the sorted hits, the brand
     * counts, and the brand counts after drilling down on one brand.
     *
     * @param args unused
     * @throws Exception if indexing or searching fails
     */
    public static void main(String[] args) throws Exception {
        // Required: used both to build facet documents and to create the DrillDownQuery.
        FacetsConfig config = new FacetsConfig();

        // try-with-resources closes the writer, reader and directory even if a step throws.
        try (Directory dir = new ByteBuffersDirectory()) {
            IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
            try (IndexWriter writer = new IndexWriter(dir, cfg)) {
                // Three test documents.
                addDoc(writer, config, "1", 1999, "小米");
                addDoc(writer, config, "2", 3999, "苹果");
                addDoc(writer, config, "3", 2999, "小米");
                writer.commit();
            }

            /* ---------- Query ---------- */
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                IndexSearcher searcher = new IndexSearcher(reader);

                /* 1. Sort by price (NumericDocValues) */
                Sort sort = new Sort(new SortField("price", SortField.Type.LONG));
                TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 10, sort);
                System.out.println("按价格排序：");
                for (ScoreDoc sd : topDocs.scoreDocs) {
                    Document doc = searcher.doc(sd.doc);
                    // addDoc stores only "id"; "price" is DocValues-only and "brand" is a
                    // facet field, so both doc.get(...) calls below return null.
                    System.out.println("id=" + doc.get("id") +
                            ", 价格=" + doc.get("price") +
                            ", 品牌=" + doc.get("brand"));
                }

                /* 2. Brand facet counts (SortedSetDocValuesFacetField) */
                SortedSetDocValuesReaderState state =
                        new DefaultSortedSetDocValuesReaderState(reader);
                FacetsCollector fc = new FacetsCollector();
                FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc);
                Facets facets = new SortedSetDocValuesFacetCounts(state, fc);
                FacetResult brandResult = facets.getTopChildren(10, "brand");
                System.out.println("\n品牌统计：");
                for (LabelAndValue lv : brandResult.labelValues) {
                    System.out.println(lv.label + " : " + lv.value);
                }

                /* 3. Drill-down: keep only documents whose brand is 小米, then count again */
                DrillDownQuery dq = new DrillDownQuery(config);
                dq.add("brand", "小米"); // drill down into the 小米 brand
                FacetsCollector fc2 = new FacetsCollector();
                FacetsCollector.search(searcher, dq, 10, fc2);
                Facets facets2 = new SortedSetDocValuesFacetCounts(state, fc2);
                FacetResult afterDrill = facets2.getTopChildren(10, "brand");
                System.out.println("\n钻取后品牌统计：");
                System.out.println(afterDrill);
            }
        }
    }

    /**
     * Adds one product document to the index.
     *
     * @param w      writer that receives the document
     * @param config facets configuration used to translate the facet field
     * @param id     product id, stored and indexed as a single token
     * @param price  product price, written as numeric DocValues only
     * @param brand  product brand, written as a SortedSet facet value
     * @throws Exception if the document cannot be built or written
     */
    private static void addDoc(IndexWriter w,
                               FacetsConfig config,
                               String id,
                               int price,
                               String brand) throws Exception {
        Document doc = new Document();
        doc.add(new StringField("id", id, Field.Store.YES));       // regular stored field
        doc.add(new NumericDocValuesField("price", price));        // numeric DocValues
        doc.add(new SortedSetDocValuesFacetField("brand", brand)); // facet DocValues
        w.addDocument(config.build(doc)); // facet fields must go through FacetsConfig.build
    }
}
```

---

✅ 运行结果示例

```
按价格排序：
id=1, 价格=null, 品牌=null
id=3, 价格=null, 品牌=null
id=2, 价格=null, 品牌=null

品牌统计：
小米 : 2
苹果 : 1

钻取后品牌统计：
dim=brand path=[] value=2 childCount=1
  小米 (2)
```

---

✅ 小结

- NumericDocValues → 排序、范围聚合
- SortedSetDocValuesFacetField + FacetsConfig → 分面统计、钻取
- 代码直接基于 Lucene 8.5.0，无需额外 codec 设置。

在 Lucene 中，`doc.get("price")` 返回 `null` 是因为 `price` 字段被定义为 `NumericDocValuesField`，而 `NumericDocValuesField` 不会存储字段值。

它只用于 DocValues 索引，供排序、聚合等操作使用，但不会把值存入 `Document`。

---

✅ 解释

- `NumericDocValuesField`

- 用途：用于快速排序、聚合等操作，数据存储在 `.dvd` 文件中。

- 特点：不会出现在 `searcher.doc()` 返回的 `Document` 中，只能通过 `LeafReader` 的 DocValues 接口按文档读取（例如 `DocValues.getNumeric(leafReader, "price")`）。

- `StoredField`

- 用途：用于存储字段值，可以在 `Document` 中直接访问。

- 特点：数据存储在 `.fdt` 文件中。

---

✅ 解决方法

如果你想在 `Document` 中直接访问字段值，需要同时添加一个 `StoredField`：

```java

doc.add(new NumericDocValuesField("price", price)); // 用于 DocValues 索引

doc.add(new StoredField("price", price)); // 用于存储字段值

```

---

✅ 完整示例

```java

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.*;

import org.apache.lucene.facet.*;

import org.apache.lucene.facet.sortedset.*;

import org.apache.lucene.index.*;

import org.apache.lucene.search.*;

import org.apache.lucene.store.ByteBuffersDirectory;

public class DocValuesDemo {

public static void main(String[] args) throws Exception {

Directory dir = new ByteBuffersDirectory();

IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());

IndexWriter writer = new IndexWriter(dir, cfg);

FacetsConfig config = new FacetsConfig();

// 添加文档

addDoc(writer, config, "1", 1999, "小米");

addDoc(writer, config, "2", 3999, "苹果");

addDoc(writer, config, "3", 2999, "小米");

writer.commit();

writer.close();

// 打开索引

DirectoryReader reader = DirectoryReader.open(dir);

IndexSearcher searcher = new IndexSearcher(reader);

// 按价格排序

Sort sort = new Sort(new SortField("price", SortField.Type.LONG));

TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 10, sort);

System.out.println("按价格排序：");

for (ScoreDoc sd : topDocs.scoreDocs) {

Document doc = searcher.doc(sd.doc);

System.out.println("id=" + doc.get("id") +

", 价格=" + doc.get("price") + // 这里可以访问 price

", 品牌=" + doc.get("brand"));

}

reader.close();

dir.close();

}

private static void addDoc(IndexWriter w,

FacetsConfig config,

String id,

int price,

String brand) throws Exception {

Document doc = new Document();

doc.add(new StringField("id", id, Field.Store.YES));

doc.add(new NumericDocValuesField("price", price)); // 用于 DocValues 索引

doc.add(new StoredField("price", price)); // 用于存储字段值

doc.add(new SortedSetDocValuesFacetField("brand", brand));

w.addDocument(config.build(doc));

}

```

---

✅ 输出结果

```

按价格排序：

id=1, 价格=1999, 品牌=小米

id=3, 价格=2999, 品牌=小米

id=2, 价格=3999, 品牌=苹果

```

---

✅ 总结

- `NumericDocValuesField`：用于 DocValues 索引，不存储字段值。

- `StoredField`：用于存储字段值，可以在 `Document` 中直接访问。

- 如果需要同时支持 DocValues 索引和字段值访问，需要同时添加这两个字段。

也就是说，DocValues 是一种索引，不是用来存内容的；只有 `StoredField`（或 `Field.Store.YES`）才会存原文？

对，你总结得很准确！

✅ DocValues 的本质

DocValues 是一种列式（按字段、按文档编号组织的正排）存储结构，写在索引文件中，用于高效地按文档读取字段值（如数值、字符串等），主要用于以下场景：

1. 排序：快速按字段值排序（如价格、日期等）。

2. 聚合：快速计算字段值的统计信息（如最大值、最小值、平均值等）。

3. 分面统计：快速统计字段值的分布（如品牌、类别等）。

DocValues 不存储原始文档内容，而是将字段值编码到索引文件中（如 `.dvd` 和 `.dvm` 文件），供快速读取和计算。

---

✅ `StoredField` 的作用

`StoredField` 用于存储原始字段值，以便在检索时可以直接访问这些值。它的特点如下：

- 存储原始值：将字段值存储在 `.fdt` 文件中。

- 可直接访问：通过 `Document` 对象访问存储的字段值。

- 不参与索引：不会影响搜索、排序或聚合操作。

---

✅ 对比

| 特性 | DocValues | StoredField |
| --- | --- | --- |
| 用途 | 索引结构，用于排序、聚合、分面统计 | 存储结构，用于存储原始字段值 |
| 存储位置 | `.dvd` 和 `.dvm` 文件 | `.fdt` 文件 |
| 是否可直接访问 | 不可直接访问，需通过 `DocValues` 接口 | 可直接通过 `Document` 访问 |
| 是否参与索引 | 参与索引，用于快速读取和计算 | 不参与索引，仅存储原始值 |

---

✅ 示例代码

```java

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.*;

import org.apache.lucene.index.*;

import org.apache.lucene.search.*;

import org.apache.lucene.store.ByteBuffersDirectory;

public class DocValuesDemo {

public static void main(String[] args) throws Exception {

Directory dir = new ByteBuffersDirectory();

IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());

IndexWriter writer = new IndexWriter(dir, cfg);

// 添加文档

Document doc1 = new Document();

doc1.add(new StringField("id", "1", Field.Store.YES));

doc1.add(new NumericDocValuesField("price", 1999)); // DocValues 索引

doc1.add(new StoredField("price", 1999)); // 存储原始值

writer.addDocument(doc1);

Document doc2 = new Document();

doc2.add(new StringField("id", "2", Field.Store.YES));

doc2.add(new NumericDocValuesField("price", 3999)); // DocValues 索引

doc2.add(new StoredField("price", 3999)); // 存储原始值

writer.addDocument(doc2);

writer.commit();

writer.close();

// 打开索引

DirectoryReader reader = DirectoryReader.open(dir);

IndexSearcher searcher = new IndexSearcher(reader);

// 按价格排序

Sort sort = new Sort(new SortField("price", SortField.Type.LONG));

TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 10, sort);

System.out.println("按价格排序：");

for (ScoreDoc sd : topDocs.scoreDocs) {

Document doc = searcher.doc(sd.doc);

System.out.println("id=" + doc.get("id") +

", 价格=" + doc.get("price")); // 可以直接访问存储的字段值

}

reader.close();

dir.close();

}

```

---

✅ 输出结果

```

按价格排序：

id=1, 价格=1999

id=2, 价格=3999

```

---

✅ 总结

- DocValues：以列式结构保存字段值，支持高效排序、聚合和分面统计，但不会随 `searcher.doc()` 返回的 `Document` 一起取回。

- StoredField：用于存储原始字段值，可在检索时直接访问，但不参与索引。

通过同时使用 `NumericDocValuesField` 和 `StoredField`，可以兼顾索引性能和原始值访问的需求。

查看全文

http://www.xdnf.cn/news/16626.html

AR技术赋能航空维修：精度与效率的飞跃

2025年数学与应用物理国际会议（ICMAP 2025）

FFmpeg：因码流采集与封装不同步导致录制出来的MP4文件会出现黑屏、绿屏的问题

【AI 数据管理】Text2SQL：当AI成为你和数据库之间的金牌“翻译官”

如何将JPG、PNG、GIF图像转换成PDF、SVG、EPS矢量图像

rabbitmq--默认模式（点对点）

应用药品 GMP 证书识别技术，实现证书信息的自动化、精准化提取与核验

【动态规划算法】斐波那契数列模型

Linux730 tr:-d /-s；sort:-r,-n,-R,-o,-t,-k,-u；bash；cut:-d,-c；tee -a；uniq -c -i

独立站如何吃掉平台蛋糕？DTC模式下的成本重构与利润跃升

sqli-labs：Less-6关卡详细解析

KONG API Gateway中的核心概念

图像处理中级篇 [1]—— 彩色照相机的效果与预处理

SpringBoot之整合SSM步骤

PHP语法高级篇(七)：MySQL数据库

[论文阅读] 人工智能 + 软件工程 | 增强RESTful API测试：针对MongoDB的搜索式模糊测试新方法

【LINUX网络】使用TCP简易通信

【STM32-HAL】 SPI通信与Flash数据写入实战

国产化再进一步，杰和科技推出搭载国产芯片的主板

代码随想录算法训练营第五十五天|图论part5

【音视频】WebRTC-Web 音视频采集与播放

如何利用 Redis 的原子操作（INCR, DECR）实现分布式计数器？

CSS-in-JS 动态主题切换与首屏渲染优化

IBM Watsonx BI：AI赋能的下一代商业智能平台

相关文章：