Spring AI ETL Keyword Metadata Enricher Document Transformer

Sometimes the requirement is to extract important keywords from large documents for metadata, to solve this, Spring AI ETL Keyword Metadata Enricher Document Transformer uses a generative AI model to extract keywords from large document content and add them as metadata.

The Keyword Metadata Enricher default prompt is Give %s unique keywords for this document. Format as comma-separated Keywords

package com.example.springai.controller;

import org.springframework.ai.chat.model.ChatModel;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.TextReader;
import org.springframework.ai.transformer.KeywordMetadataEnricher;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.IOException;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.util.List;
import java.util.Map;

@RestController
public class SpringAiController {
    private final Resource apples;
    @Autowired
    private ChatModel chatModel;

    public SpringAiController(@Value("classpath:apples.st") Resource apples) {
        this.apples = apples;
    }

    @GetMapping("/textDocument")
    List<Document> textDocument() throws IOException {
        var textReader = new TextReader(apples);
        textReader.getCustomMetadata()
                .putAll(Map.of("length", apples.contentLength(), "last modified", LocalDateTime.ofInstant(Instant.ofEpochMilli(apples.lastModified()), ZoneId.systemDefault())));
        var documents = textReader.get();
        TokenTextSplitter splitter = new TokenTextSplitter(true);
        return keywordMetadataEnricher(splitter.apply(documents));
    }

    List<Document> keywordMetadataEnricher(List<Document> documents) {
        KeywordMetadataEnricher keywordMetadataEnricher = new KeywordMetadataEnricher(chatModel, 5);
        return keywordMetadataEnricher.apply(documents);
    }
}
package com.example.springai;

import org.springframework.ai.chat.model.ChatModel;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class SpringAiApplication {
    @Autowired
    private ChatModel chatModel;

    public static void main(String[] args) {
        SpringApplication.run(SpringAiApplication.class, args);
    }
}

create a file apples.st in src/main/resources directory

spring.application.name=SpringAi
spring.docker.compose.lifecycle-management=start-only
spring.threads.virtual.enabled=true
# The default Ollama Model in Spring Ai is mistral, but it can be changed by setting the below property. make sure to download the same model in entrypoint.sh file
#spring.ai.ollama.chat.options.model=llama3.1
# If running the Ollama Docker Instance separately, then set this property
spring.docker.compose.enabled=false
services:
  ollama-model:
    image: ollama/ollama:latest
    container_name: ollama_container
    ports:
      - 11434:11434/tcp
    healthcheck:
      test: ollama --version || exit 1
    command: serve
    volumes:
      - ./ollama/ollama:/root/.ollama
      - ./entrypoint.sh:/entrypoint.sh
    pull_policy: missing
    tty: true
    restart: no
    entrypoint: [ "/usr/bin/bash", "/entrypoint.sh" ]

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open_webui_container
    environment:
      WEBUI_AUTH: false
    ports:
      - "8081:8080"
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      - open-webui:/app/backend/data
    restart: no

volumes:
  open-webui:
#!/bin/bash
# Start Ollama in the background.
/bin/ollama serve &
# Record Process ID.
pid=$!
# Pause for Ollama to start.
sleep 5
# The default Ollama Model in Spring Ai is mistral, but it can be changed in the applications property file. Make sure to download the same Model here
echo "🔴 Retrieve LLAMA3 model..."
ollama pull mistral
echo "🟢 Done!"
# Wait for the Ollama process to finish.
wait $pid
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>3.3.2</version>
        <relativePath/>
    </parent>
    <groupId>com.example.springai</groupId>
    <artifactId>etl_pipeline_text_split_keyword_metadata_enricher</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>ETL Pipeline Text Split Keyword Metadata Enricher</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>21</java.version>
        <spring-ai.version>1.0.0-SNAPSHOT</spring-ai.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.ai</groupId>
            <artifactId>spring-ai-ollama-spring-boot-starter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-docker-compose</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.springframework.ai</groupId>
                <artifactId>spring-ai-bom</artifactId>
                <version>${spring-ai.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <mainClass>com.example.springai.SpringAiApplication</mainClass>
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <repositories>
        <repository>
            <id>spring-milestones</id>
            <name>Spring Milestones</name>
            <url>https://repo.spring.io/milestone</url>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>
        <repository>
            <id>spring-snapshots</id>
            <name>Spring Snapshots</name>
            <url>https://repo.spring.io/snapshot</url>
            <releases>
                <enabled>false</enabled>
            </releases>
        </repository>
    </repositories>
</project>

Run the curl to see the Spring AI ETL Keyword Metadata Enricher Document Transformer

http://localhost:8080/textDocument

follow us on