GDocument.java 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332
  1. /*
  2. * Copyright 2023-2024 the original author or authors.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * https://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package org.cnnlp.data.document;
  17. import java.util.HashMap;
  18. import java.util.Map;
  19. import java.util.Objects;
  20. import com.fasterxml.jackson.annotation.JsonCreator;
  21. //import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
  22. import com.fasterxml.jackson.annotation.JsonProperty;
  23. import org.cnnlp.data.util.id.IdGenerator;
  24. import org.cnnlp.data.util.id.RandomIdGenerator;
  25. //import org.springframework.ai.document.ContentFormatter;
  26. //import org.springframework.ai.document.DefaultContentFormatter;
  27. //import org.springframework.ai.document.MetadataMode;
  28. //import org.springframework.ai.document.id.IdGenerator;
  29. //import org.springframework.ai.document.id.RandomIdGenerator;
  30. //import org.springframework.ai.model.Media;
  31. //import org.springframework.lang.Nullable;
  32. //import org.springframework.util.Assert;
  33. //import org.springframework.util.StringUtils;
  34. /**
  35. * A document is a container for the content and metadata of a document. It also contains
  36. * the document's unique ID.
  37. *
  38. * A Document can hold either text content or media content, but not both.
  39. *
  40. * It is intended to be used to take data from external sources as part of spring-ai's ETL
  41. * pipeline.
  42. *
  43. * <p>
  44. * Example of creating a text document: <pre>{@code
  45. * // Using constructor
  46. * Document textDoc = new Document("Sample text content", Map.of("source", "user-input"));
  47. *
  48. * // Using builder
  49. * Document textDoc = Document.builder()
  50. * .text("Sample text content")
  51. * .metadata("source", "user-input")
  52. * .build();
  53. * }</pre>
  54. *
  55. * <p>
  56. * Example of creating a media document: <pre>{@code
  57. * // Using constructor
  58. * Media imageContent = new Media(MediaType.IMAGE_PNG, new byte[] {...});
  59. * Document mediaDoc = new Document(imageContent, Map.of("filename", "sample.png"));
  60. *
  61. * // Using builder
  62. * Document mediaDoc = Document.builder()
  63. * .media(new Media(MediaType.IMAGE_PNG, new byte[] {...}))
  64. * .metadata("filename", "sample.png")
  65. * .build();
  66. * }</pre>
  67. *
  68. * <p>
  69. * Example of checking content type and accessing content: <pre>{@code
  70. * if (document.isText()) {
  71. * String textContent = document.getText();
  72. * // Process text content
  73. * } else {
  74. * Media mediaContent = document.getMedia();
  75. * // Process media content
  76. * }
  77. * }</pre>
  78. */
  79. //@JsonIgnoreProperties({ "contentFormatter", "embedding" })
  80. public class GDocument {
  81. // public static final ContentFormatter DEFAULT_CONTENT_FORMATTER = DefaultContentFormatter.defaultConfig();
  82. /**
  83. * Unique ID
  84. */
  85. private final String id;
  86. /**
  87. * Document string content.
  88. */
  89. private final String text;
  90. /**
  91. * Document media content
  92. */
  93. // private final Media media;
  94. /**
  95. * Metadata for the document. It should not be nested and values should be restricted
  96. * to string, int, float, boolean for simple use with Vector Dbs.
  97. */
  98. private final Map<String, Object> metadata;
  99. /**
  100. * A numeric score associated with this document that can represent various types of
  101. * relevance measures.
  102. * <p>
  103. * Common uses include:
  104. * <ul>
  105. * <li>Measure of similarity between the embedding value of the document's text/media
  106. * and a query vector, where higher scores indicate greater similarity (opposite of
  107. * distance measure)
  108. * <li>Text relevancy rankings from retrieval systems
  109. * <li>Custom relevancy metrics from RAG patterns
  110. * </ul>
  111. * <p>
  112. * Higher values typically indicate greater relevance or similarity.
  113. */
  114. //@Nullable
  115. private final Double score;
  116. /**
  117. * Mutable, ephemeral, content to text formatter. Defaults to Document text.
  118. */
  119. //@JsonIgnore
  120. //private ContentFormatter contentFormatter = DEFAULT_CONTENT_FORMATTER;
  121. @JsonCreator(mode = JsonCreator.Mode.PROPERTIES)
  122. public GDocument(@JsonProperty("content") String content) {
  123. this(content, new HashMap<>());
  124. }
  125. public GDocument(String text, Map<String, Object> metadata) {
  126. this(new RandomIdGenerator().generateId(), text, metadata, null);
  127. }
  128. public GDocument(String id, String text, Map<String, Object> metadata) {
  129. this(id, text, metadata, null);
  130. }
  131. // public Document(Media media, Map<String, Object> metadata) {
  132. // this(new RandomIdGenerator().generateId(), null, media, metadata, null);
  133. // }
  134. //
  135. // public Document(String id, Media media, Map<String, Object> metadata) {
  136. // this(id, null, media, metadata, null);
  137. // }
  138. private GDocument(String id, String text, Map<String, Object> metadata, Double score) {
  139. // Assert.hasText(id, "id cannot be null or empty");
  140. // Assert.notNull(metadata, "metadata cannot be null");
  141. // Assert.noNullElements(metadata.keySet(), "metadata cannot have null keys");
  142. // Assert.noNullElements(metadata.values(), "metadata cannot have null values");
  143. // Assert.isTrue(text != null ^ media != null, "exactly one of text or media must be specified");
  144. this.id = id;
  145. this.text = text;
  146. // this.media = media;
  147. this.metadata = new HashMap<>(metadata);
  148. this.score = score;
  149. }
  150. public static Builder builder() {
  151. return new Builder();
  152. }
  153. /**
  154. * Returns the unique identifier for this document.
  155. * <p>
  156. * This ID is either explicitly provided during document creation or generated using
  157. * the configured {@link IdGenerator} (defaults to {@link RandomIdGenerator}).
  158. * @return the unique identifier of this document
  159. * @see RandomIdGenerator
  160. */
  161. public String getId() {
  162. return this.id;
  163. }
  164. /**
  165. * Returns the document's text content, if any.
  166. * @return the text content
  167. */
  168. //@Nullable
  169. public String getText() {
  170. return this.text;
  171. }
  172. /**
  173. * Determines whether this document contains text or media content.
  174. * @return true if this document contains text content (accessible via
  175. * {@link #getText()}), false if it contains media content (accessible via
  176. */
  177. // public boolean isText() {
  178. // return this.text != null;
  179. // }
  180. /**
  181. * Returns the metadata associated with this document.
  182. * <p>
  183. * The metadata values are restricted to simple types (string, int, float, boolean)
  184. * for compatibility with Vector Databases.
  185. * @return the metadata map
  186. */
  187. public Map<String, Object> getMetadata() {
  188. return this.metadata;
  189. }
  190. public Double getScore() {
  191. return this.score;
  192. }
  193. public Builder mutate() {
  194. return new Builder().id(this.id).text(this.text).metadata(this.metadata).score(this.score);
  195. }
  196. @Override
  197. public boolean equals(Object o) {
  198. if (o == null || this.getClass() != o.getClass()) {
  199. return false;
  200. }
  201. GDocument document = (GDocument) o;
  202. return Objects.equals(this.id, document.id) && Objects.equals(this.text, document.text)
  203. && Objects.equals(this.metadata, document.metadata)
  204. && Objects.equals(this.score, document.score);
  205. }
  206. @Override
  207. public int hashCode() {
  208. return Objects.hash(this.id, this.text, this.metadata, this.score);
  209. }
  210. @Override
  211. public String toString() {
  212. return "Document{" + "id='" + this.id + '\'' + ", text='" + this.text + '\''
  213. + ", metadata=" + this.metadata + ", score=" + this.score + '}';
  214. }
  215. public static class Builder {
  216. private String id;
  217. private String text;
  218. private Map<String, Object> metadata = new HashMap<>();
  219. private Double score;
  220. private IdGenerator idGenerator = new RandomIdGenerator();
  221. public Builder idGenerator(IdGenerator idGenerator) {
  222. //Assert.notNull(idGenerator, "idGenerator cannot be null");
  223. this.idGenerator = idGenerator;
  224. return this;
  225. }
  226. public Builder id(String id) {
  227. //Assert.hasText(id, "id cannot be null or empty");
  228. this.id = id;
  229. return this;
  230. }
  231. /**
  232. * Sets the text content of the document.
  233. * <p>
  234. * Either text or media content must be set before building the document, but not
  235. * both.
  236. * @param text the text content
  237. * @return the builder instance
  238. */
  239. public Builder text(String text) {
  240. this.text = text;
  241. return this;
  242. }
  243. public Builder metadata(Map<String, Object> metadata) {
  244. //Assert.notNull(metadata, "metadata cannot be null");
  245. this.metadata = metadata;
  246. return this;
  247. }
  248. public Builder metadata(String key, Object value) {
  249. //Assert.notNull(key, "metadata key cannot be null");
  250. //Assert.notNull(value, "metadata value cannot be null");
  251. this.metadata.put(key, value);
  252. return this;
  253. }
  254. /**
  255. * Sets a score value for this document.
  256. * <p>
  257. * Common uses include:
  258. * <ul>
  259. * <li>Measure of similarity between the embedding value of the document's
  260. * text/media and a query vector, where higher scores indicate greater similarity
  261. * (opposite of distance measure)
  262. * <li>Text relevancy rankings from retrieval systems
  263. * <li>Custom relevancy metrics from RAG patterns
  264. * </ul>
  265. * <p>
  266. * Higher values typically indicate greater relevance or similarity.
  267. * @param score the document score, may be null
  268. * @return the builder instance
  269. */
  270. public Builder score(Double score) {
  271. this.score = score;
  272. return this;
  273. }
  274. public static boolean hasText(String str) {
  275. return str != null && !str.isBlank();
  276. }
  277. public GDocument build() {
  278. if (!hasText(this.id)) {
  279. this.id = this.idGenerator.generateId(this.text, this.metadata);
  280. }
  281. return new GDocument(this.id, this.text, this.metadata, this.score);
  282. }
  283. }
  284. }