| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332 |
- /*
- * Copyright 2023-2024 the original author or authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.cnnlp.data.document;
- import java.util.HashMap;
- import java.util.Map;
- import java.util.Objects;
- import com.fasterxml.jackson.annotation.JsonCreator;
- //import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
- import com.fasterxml.jackson.annotation.JsonProperty;
- import org.cnnlp.data.util.id.IdGenerator;
- import org.cnnlp.data.util.id.RandomIdGenerator;
- //import org.springframework.ai.document.ContentFormatter;
- //import org.springframework.ai.document.DefaultContentFormatter;
- //import org.springframework.ai.document.MetadataMode;
- //import org.springframework.ai.document.id.IdGenerator;
- //import org.springframework.ai.document.id.RandomIdGenerator;
- //import org.springframework.ai.model.Media;
- //import org.springframework.lang.Nullable;
- //import org.springframework.util.Assert;
- //import org.springframework.util.StringUtils;
- /**
- * A document is a container for the content and metadata of a document. It also contains
- * the document's unique ID.
- *
- * A Document can hold either text content or media content, but not both.
- *
- * It is intended to be used to take data from external sources as part of spring-ai's ETL
- * pipeline.
- *
- * <p>
- * Example of creating a text document: <pre>{@code
- * // Using constructor
- * Document textDoc = new Document("Sample text content", Map.of("source", "user-input"));
- *
- * // Using builder
- * Document textDoc = Document.builder()
- * .text("Sample text content")
- * .metadata("source", "user-input")
- * .build();
- * }</pre>
- *
- * <p>
- * Example of creating a media document: <pre>{@code
- * // Using constructor
- * Media imageContent = new Media(MediaType.IMAGE_PNG, new byte[] {...});
- * Document mediaDoc = new Document(imageContent, Map.of("filename", "sample.png"));
- *
- * // Using builder
- * Document mediaDoc = Document.builder()
- * .media(new Media(MediaType.IMAGE_PNG, new byte[] {...}))
- * .metadata("filename", "sample.png")
- * .build();
- * }</pre>
- *
- * <p>
- * Example of checking content type and accessing content: <pre>{@code
- * if (document.isText()) {
- * String textContent = document.getText();
- * // Process text content
- * } else {
- * Media mediaContent = document.getMedia();
- * // Process media content
- * }
- * }</pre>
- */
- //@JsonIgnoreProperties({ "contentFormatter", "embedding" })
- public class GDocument {
- // public static final ContentFormatter DEFAULT_CONTENT_FORMATTER = DefaultContentFormatter.defaultConfig();
- /**
- * Unique ID
- */
- private final String id;
- /**
- * Document string content.
- */
- private final String text;
- /**
- * Document media content
- */
- // private final Media media;
- /**
- * Metadata for the document. It should not be nested and values should be restricted
- * to string, int, float, boolean for simple use with Vector Dbs.
- */
- private final Map<String, Object> metadata;
- /**
- * A numeric score associated with this document that can represent various types of
- * relevance measures.
- * <p>
- * Common uses include:
- * <ul>
- * <li>Measure of similarity between the embedding value of the document's text/media
- * and a query vector, where higher scores indicate greater similarity (opposite of
- * distance measure)
- * <li>Text relevancy rankings from retrieval systems
- * <li>Custom relevancy metrics from RAG patterns
- * </ul>
- * <p>
- * Higher values typically indicate greater relevance or similarity.
- */
- //@Nullable
- private final Double score;
- /**
- * Mutable, ephemeral, content to text formatter. Defaults to Document text.
- */
- //@JsonIgnore
- //private ContentFormatter contentFormatter = DEFAULT_CONTENT_FORMATTER;
- @JsonCreator(mode = JsonCreator.Mode.PROPERTIES)
- public GDocument(@JsonProperty("content") String content) {
- this(content, new HashMap<>());
- }
- public GDocument(String text, Map<String, Object> metadata) {
- this(new RandomIdGenerator().generateId(), text, metadata, null);
- }
- public GDocument(String id, String text, Map<String, Object> metadata) {
- this(id, text, metadata, null);
- }
- // public Document(Media media, Map<String, Object> metadata) {
- // this(new RandomIdGenerator().generateId(), null, media, metadata, null);
- // }
- //
- // public Document(String id, Media media, Map<String, Object> metadata) {
- // this(id, null, media, metadata, null);
- // }
- private GDocument(String id, String text, Map<String, Object> metadata, Double score) {
- // Assert.hasText(id, "id cannot be null or empty");
- // Assert.notNull(metadata, "metadata cannot be null");
- // Assert.noNullElements(metadata.keySet(), "metadata cannot have null keys");
- // Assert.noNullElements(metadata.values(), "metadata cannot have null values");
- // Assert.isTrue(text != null ^ media != null, "exactly one of text or media must be specified");
- this.id = id;
- this.text = text;
- // this.media = media;
- this.metadata = new HashMap<>(metadata);
- this.score = score;
- }
- public static Builder builder() {
- return new Builder();
- }
- /**
- * Returns the unique identifier for this document.
- * <p>
- * This ID is either explicitly provided during document creation or generated using
- * the configured {@link IdGenerator} (defaults to {@link RandomIdGenerator}).
- * @return the unique identifier of this document
- * @see RandomIdGenerator
- */
- public String getId() {
- return this.id;
- }
- /**
- * Returns the document's text content, if any.
- * @return the text content
- */
- //@Nullable
- public String getText() {
- return this.text;
- }
- /**
- * Determines whether this document contains text or media content.
- * @return true if this document contains text content (accessible via
- * {@link #getText()}), false if it contains media content (accessible via
- */
- // public boolean isText() {
- // return this.text != null;
- // }
- /**
- * Returns the metadata associated with this document.
- * <p>
- * The metadata values are restricted to simple types (string, int, float, boolean)
- * for compatibility with Vector Databases.
- * @return the metadata map
- */
- public Map<String, Object> getMetadata() {
- return this.metadata;
- }
- public Double getScore() {
- return this.score;
- }
- public Builder mutate() {
- return new Builder().id(this.id).text(this.text).metadata(this.metadata).score(this.score);
- }
- @Override
- public boolean equals(Object o) {
- if (o == null || this.getClass() != o.getClass()) {
- return false;
- }
- GDocument document = (GDocument) o;
- return Objects.equals(this.id, document.id) && Objects.equals(this.text, document.text)
- && Objects.equals(this.metadata, document.metadata)
- && Objects.equals(this.score, document.score);
- }
- @Override
- public int hashCode() {
- return Objects.hash(this.id, this.text, this.metadata, this.score);
- }
- @Override
- public String toString() {
- return "Document{" + "id='" + this.id + '\'' + ", text='" + this.text + '\''
- + ", metadata=" + this.metadata + ", score=" + this.score + '}';
- }
- public static class Builder {
- private String id;
- private String text;
- private Map<String, Object> metadata = new HashMap<>();
- private Double score;
- private IdGenerator idGenerator = new RandomIdGenerator();
- public Builder idGenerator(IdGenerator idGenerator) {
- //Assert.notNull(idGenerator, "idGenerator cannot be null");
- this.idGenerator = idGenerator;
- return this;
- }
- public Builder id(String id) {
- //Assert.hasText(id, "id cannot be null or empty");
- this.id = id;
- return this;
- }
- /**
- * Sets the text content of the document.
- * <p>
- * Either text or media content must be set before building the document, but not
- * both.
- * @param text the text content
- * @return the builder instance
- */
- public Builder text(String text) {
- this.text = text;
- return this;
- }
- public Builder metadata(Map<String, Object> metadata) {
- //Assert.notNull(metadata, "metadata cannot be null");
- this.metadata = metadata;
- return this;
- }
- public Builder metadata(String key, Object value) {
- //Assert.notNull(key, "metadata key cannot be null");
- //Assert.notNull(value, "metadata value cannot be null");
- this.metadata.put(key, value);
- return this;
- }
- /**
- * Sets a score value for this document.
- * <p>
- * Common uses include:
- * <ul>
- * <li>Measure of similarity between the embedding value of the document's
- * text/media and a query vector, where higher scores indicate greater similarity
- * (opposite of distance measure)
- * <li>Text relevancy rankings from retrieval systems
- * <li>Custom relevancy metrics from RAG patterns
- * </ul>
- * <p>
- * Higher values typically indicate greater relevance or similarity.
- * @param score the document score, may be null
- * @return the builder instance
- */
- public Builder score(Double score) {
- this.score = score;
- return this;
- }
- public static boolean hasText(String str) {
- return str != null && !str.isBlank();
- }
- public GDocument build() {
- if (!hasText(this.id)) {
- this.id = this.idGenerator.generateId(this.text, this.metadata);
- }
- return new GDocument(this.id, this.text, this.metadata, this.score);
- }
- }
- }
|