/* * Copyright 2023-2024 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * https://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.cnnlp.data.document; import java.util.HashMap; import java.util.Map; import java.util.Objects; import com.fasterxml.jackson.annotation.JsonCreator; //import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonProperty; import org.cnnlp.data.util.id.IdGenerator; import org.cnnlp.data.util.id.RandomIdGenerator; //import org.springframework.ai.document.ContentFormatter; //import org.springframework.ai.document.DefaultContentFormatter; //import org.springframework.ai.document.MetadataMode; //import org.springframework.ai.document.id.IdGenerator; //import org.springframework.ai.document.id.RandomIdGenerator; //import org.springframework.ai.model.Media; //import org.springframework.lang.Nullable; //import org.springframework.util.Assert; //import org.springframework.util.StringUtils; /** * A document is a container for the content and metadata of a document. It also contains * the document's unique ID. * * A Document can hold either text content or media content, but not both. * * It is intended to be used to take data from external sources as part of spring-ai's ETL * pipeline. * *
 * <p>Example of creating a text document:
 * <pre>{@code
 * // Using constructor
 * Document textDoc = new Document("Sample text content", Map.of("source", "user-input"));
 *
 * // Using builder
 * Document textDoc = Document.builder()
 *     .text("Sample text content")
 *     .metadata("source", "user-input")
 *     .build();
 * }</pre>
*
 * <p>Example of creating a media document:
 * <pre>{@code
 * // Using constructor
 * Media imageContent = new Media(MediaType.IMAGE_PNG, new byte[] {...});
 * Document mediaDoc = new Document(imageContent, Map.of("filename", "sample.png"));
 *
 * // Using builder
 * Document mediaDoc = Document.builder()
 *     .media(new Media(MediaType.IMAGE_PNG, new byte[] {...}))
 *     .metadata("filename", "sample.png")
 *     .build();
 * }</pre>
*
 * <p>Example of checking content type and accessing content:
 * <pre>{@code
 * if (document.isText()) {
 *     String textContent = document.getText();
 *     // Process text content
 * } else {
 *     Media mediaContent = document.getMedia();
 *     // Process media content
 * }
 * }</pre>
*/
//@JsonIgnoreProperties({ "contentFormatter", "embedding" })
public class GDocument {
// public static final ContentFormatter DEFAULT_CONTENT_FORMATTER = DefaultContentFormatter.defaultConfig();
/**
 * Unique identifier of this document. Declared {@code final}, so it is assigned
 * exactly once during construction.
 */
private final String id;
/**
 * Text content of this document. Declared {@code final}, so it is assigned
 * exactly once during construction.
 */
private final String text;
/**
* Document media content
*/
// private final Media media;
/**
* Metadata for the document. It should not be nested and values should be restricted
* to string, int, float, boolean for simple use with Vector Dbs.
*/
private final Map* Common uses include: *
* Higher values typically indicate greater relevance or similarity.
*/
//@Nullable
private final Double score;
/**
* Mutable, ephemeral, content to text formatter. Defaults to Document text.
*/
//@JsonIgnore
//private ContentFormatter contentFormatter = DEFAULT_CONTENT_FORMATTER;
/**
 * Creates a document from its content alone.
 * <p>
 * Annotated as the Jackson creator: the value is bound from the JSON property
 * {@code "content"}. The call is forwarded to the two-argument constructor with
 * a freshly allocated, empty {@link HashMap} as the second argument.
 * @param content the content of the document
 */
@JsonCreator(mode = JsonCreator.Mode.PROPERTIES)
public GDocument(
        @JsonProperty("content") String content) {
    this(content, new HashMap<>());
}
public GDocument(String text, Map
* This ID is either explicitly provided during document creation or generated using
* the configured {@link IdGenerator} (defaults to {@link RandomIdGenerator}).
* @return the unique identifier of this document
* @see RandomIdGenerator
*/
public String getId() {
    // The backing field is final, so this value is fixed once the document exists.
    return id;
}
/**
 * Gives access to the text held by this document.
 * @return the value of the {@code text} field; NOTE(review): the disabled
 * {@code @Nullable} marker below suggests it may be {@code null} - confirm
 */
//@Nullable
public String getText() {
    return text;
}
/**
 * Determines whether this document contains text or media content.
 * @return true if this document contains text content (accessible via
 * {@link #getText()}), false if it contains media content
 */
// public boolean isText() {
// return this.text != null;
// }
/**
* Returns the metadata associated with this document.
*
* The metadata values are restricted to simple types (string, int, float, boolean)
* for compatibility with Vector Databases.
* @return the metadata map
*/
public Map
* Either text or media content must be set before building the document, but not
* both.
* @param text the text content
* @return the builder instance
*/
public Builder text(String text) {
    // Stored as given; no validation happens at this point.
    this.text = text;

    // Hand back the same builder so calls can be chained.
    return this;
}
public Builder metadata(Map
* Common uses include:
*
* Higher values typically indicate greater relevance or similarity.
* @param score the document score, may be null
* @return the builder instance
*/
public Builder score(Double score) {
    // Stored as given (a null score is accepted; nothing is checked here).
    this.score = score;

    // Hand back the same builder so calls can be chained.
    return this;
}
/**
 * Tells whether the given string carries any non-whitespace content.
 * @param str the string to inspect, may be {@code null}
 * @return {@code false} for {@code null} and for strings for which
 * {@link String#isBlank()} is true; {@code true} otherwise
 */
public static boolean hasText(String str) {
    if (str == null) {
        return false;
    }
    return !str.isBlank();
}
/**
 * Creates the {@link GDocument} from the values collected by this builder.
 * <p>
 * When no usable id has been set (it is {@code null} or blank according to
 * {@link #hasText(String)}), one is obtained from the builder's id generator,
 * which receives the current text and metadata. The generated id is written
 * back to this builder before the document is created.
 * @return a new document carrying this builder's id, text, metadata and score
 */
public GDocument build() {
    final boolean needsGeneratedId = !hasText(this.id);
    if (needsGeneratedId) {
        this.id = this.idGenerator.generateId(this.text, this.metadata);
    }

    return new GDocument(this.id, this.text, this.metadata, this.score);
}
}
}
*
*