/*
 * Copyright 2023-2024 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.cnnlp.data.document;

import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;

import org.cnnlp.data.util.id.IdGenerator;
import org.cnnlp.data.util.id.RandomIdGenerator;

/**
 * A container for the text content and metadata of a document, together with the
 * document's unique ID and an optional score.
 *
 * <p>
 * This class appears to be adapted from Spring AI's {@code Document}; unlike that class
 * it holds text content only (there is no media content and no content formatter).
 *
 * <p>
 * Example of creating a document:
 *
 * <pre>{@code
 * // Using a constructor (an ID is generated)
 * GDocument doc = new GDocument("Sample text content", Map.of("source", "user-input"));
 *
 * // Using the builder
 * GDocument doc = GDocument.builder()
 *     .text("Sample text content")
 *     .metadata("source", "user-input")
 *     .build();
 * }</pre>
 *
 * <p>
 * The fields are final and the metadata map is copied on construction, but
 * {@link #getMetadata()} returns the internal map itself, so metadata can still be
 * changed after construction. Because {@link #equals(Object)} and {@link #hashCode()}
 * include the metadata, avoid mutating it while the document is used as a key in a
 * hash-based collection.
 */
public class GDocument {

	/**
	 * Unique ID.
	 */
	private final String id;

	/**
	 * Document string content. May be {@code null}.
	 */
	private final String text;

	/**
	 * Metadata for the document. It should not be nested and values should be restricted
	 * to string, int, float, boolean for simple use with vector databases. This class
	 * does not enforce that restriction.
	 */
	private final Map<String, Object> metadata;

	/**
	 * A numeric score associated with this document that can represent various types of
	 * relevance measures. Higher values typically indicate greater relevance or
	 * similarity. May be {@code null}; this class only stores the value and never
	 * interprets it.
	 */
	private final Double score;

	/**
	 * Creates a document with the given text, empty metadata and a generated ID.
	 *
	 * <p>
	 * This is the Jackson creator: the text is read from the JSON property
	 * {@code "content"}.
	 *
	 * <p>
	 * NOTE(review): the getter is {@link #getText()}, so default Jackson serialization
	 * would presumably emit {@code "text"} rather than {@code "content"} - confirm
	 * whether a JSON round trip is expected to work.
	 * @param content the text content
	 */
	@JsonCreator(mode = JsonCreator.Mode.PROPERTIES)
	public GDocument(@JsonProperty("content") String content) {
		this(content, new HashMap<>());
	}

	/**
	 * Creates a document with the given text and metadata and an ID obtained from a new
	 * {@link RandomIdGenerator}.
	 * @param text the text content
	 * @param metadata the metadata; copied, must not be {@code null}
	 * @throws NullPointerException if {@code metadata} is {@code null}
	 */
	public GDocument(String text, Map<String, Object> metadata) {
		this(new RandomIdGenerator().generateId(), text, metadata, null);
	}

	/**
	 * Creates a document with an explicit ID, text and metadata, and no score.
	 * @param id the document ID (stored as given, not validated)
	 * @param text the text content
	 * @param metadata the metadata; copied, must not be {@code null}
	 * @throws NullPointerException if {@code metadata} is {@code null}
	 */
	public GDocument(String id, String text, Map<String, Object> metadata) {
		this(id, text, metadata, null);
	}

	/**
	 * Canonical constructor used by the public constructors and by {@link Builder}.
	 * @param id the document ID
	 * @param text the text content
	 * @param metadata the metadata; copied, must not be {@code null}
	 * @param score the score, may be {@code null}
	 * @throws NullPointerException if {@code metadata} is {@code null}
	 */
	private GDocument(String id, String text, Map<String, Object> metadata, Double score) {
		// Fail with a clear message instead of an anonymous NPE from the HashMap copy
		// constructor below.
		Objects.requireNonNull(metadata, "metadata cannot be null");
		this.id = id;
		this.text = text;
		// Defensive copy: later changes to the caller's map do not affect this document.
		this.metadata = new HashMap<>(metadata);
		this.score = score;
	}

	/**
	 * Creates a new, empty {@link Builder}.
	 * @return a new builder
	 */
	public static Builder builder() {
		return new Builder();
	}

	/**
	 * Returns the unique identifier for this document.
	 *
	 * <p>
	 * This ID is either explicitly provided during document creation or generated using
	 * an {@link IdGenerator} (a {@link RandomIdGenerator} unless the builder was given
	 * another one).
	 * @return the unique identifier of this document
	 * @see RandomIdGenerator
	 */
	public String getId() {
		return this.id;
	}

	/**
	 * Returns the document's text content, if any.
	 * @return the text content, may be {@code null}
	 */
	public String getText() {
		return this.text;
	}

	/**
	 * Returns the metadata associated with this document.
	 *
	 * <p>
	 * The metadata values are intended to be simple types (string, int, float, boolean)
	 * for compatibility with vector databases. The returned map is the document's
	 * internal map, not a copy, so changes made to it are visible through this
	 * document.
	 * @return the metadata map
	 */
	public Map<String, Object> getMetadata() {
		return this.metadata;
	}

	/**
	 * Returns the score associated with this document.
	 * @return the score, or {@code null} if none was set
	 */
	public Double getScore() {
		return this.score;
	}

	/**
	 * Returns a builder pre-populated with this document's ID, text, metadata and score.
	 *
	 * <p>
	 * The builder receives a copy of the metadata, so adding entries through the builder
	 * does not modify this document.
	 * @return a builder initialised from this document
	 */
	public Builder mutate() {
		return new Builder().id(this.id).text(this.text).metadata(this.metadata).score(this.score);
	}

	/**
	 * Compares ID, text, metadata and score. Only instances of exactly the same class
	 * can be equal.
	 */
	@Override
	public boolean equals(Object o) {
		if (this == o) {
			return true;
		}
		if (o == null || this.getClass() != o.getClass()) {
			return false;
		}
		GDocument document = (GDocument) o;
		return Objects.equals(this.id, document.id) && Objects.equals(this.text, document.text)
				&& Objects.equals(this.metadata, document.metadata) && Objects.equals(this.score, document.score);
	}

	@Override
	public int hashCode() {
		return Objects.hash(this.id, this.text, this.metadata, this.score);
	}

	/**
	 * Returns a textual representation of the form
	 * {@code Document{id='...', text='...', metadata=..., score=...}}. The
	 * {@code Document} prefix is kept unchanged so the output stays the same for
	 * existing callers.
	 */
	@Override
	public String toString() {
		return "Document{" + "id='" + this.id + '\'' + ", text='" + this.text + '\'' + ", metadata=" + this.metadata
				+ ", score=" + this.score + '}';
	}

	/**
	 * Fluent builder for {@link GDocument}.
	 */
	public static class Builder {

		private String id;

		private String text;

		private Map<String, Object> metadata = new HashMap<>();

		private Double score;

		private IdGenerator idGenerator = new RandomIdGenerator();

		/**
		 * Sets the generator used by {@link #build()} when no non-blank ID has been set.
		 * @param idGenerator the ID generator
		 * @return the builder instance
		 */
		public Builder idGenerator(IdGenerator idGenerator) {
			this.idGenerator = idGenerator;
			return this;
		}

		/**
		 * Sets an explicit document ID. A {@code null} or blank ID is treated as "not
		 * set" by {@link #build()}, which then generates one.
		 * @param id the document ID
		 * @return the builder instance
		 */
		public Builder id(String id) {
			this.id = id;
			return this;
		}

		/**
		 * Sets the text content of the document.
		 * @param text the text content
		 * @return the builder instance
		 */
		public Builder text(String text) {
			this.text = text;
			return this;
		}

		/**
		 * Replaces the builder's metadata with a copy of the given map.
		 *
		 * <p>
		 * The map is copied so that later {@link #metadata(String, Object)} calls
		 * neither modify the caller's map (for example the internal map of the document
		 * this builder was created from via {@link GDocument#mutate()}) nor fail when
		 * the caller passed an unmodifiable map.
		 * @param metadata the metadata, must not be {@code null}
		 * @return the builder instance
		 * @throws NullPointerException if {@code metadata} is {@code null}
		 */
		public Builder metadata(Map<String, Object> metadata) {
			Objects.requireNonNull(metadata, "metadata cannot be null");
			this.metadata = new HashMap<>(metadata);
			return this;
		}

		/**
		 * Adds a single metadata entry, replacing any existing value for the key.
		 * @param key the metadata key
		 * @param value the metadata value
		 * @return the builder instance
		 */
		public Builder metadata(String key, Object value) {
			this.metadata.put(key, value);
			return this;
		}

		/**
		 * Sets a score value for this document. Higher values typically indicate
		 * greater relevance or similarity.
		 * @param score the document score, may be null
		 * @return the builder instance
		 */
		public Builder score(Double score) {
			this.score = score;
			return this;
		}

		/**
		 * Tells whether the given string contains at least one non-whitespace character.
		 * @param str the string to check, may be {@code null}
		 * @return {@code true} if {@code str} is neither {@code null} nor blank
		 */
		public static boolean hasText(String str) {
			return str != null && !str.isBlank();
		}

		/**
		 * Builds the document.
		 *
		 * <p>
		 * If no non-blank ID has been set, one is generated by passing the text and
		 * metadata to the configured {@link IdGenerator}. The generated ID is remembered
		 * by the builder, so subsequent {@code build()} calls reuse it.
		 * @return the new document
		 */
		public GDocument build() {
			if (!hasText(this.id)) {
				this.id = this.idGenerator.generateId(this.text, this.metadata);
			}
			return new GDocument(this.id, this.text, this.metadata, this.score);
		}

	}

}