Open Evals

An open-source framework for evaluating and testing LLM applications with built-in metrics and synthetic data generation.

pnpm add @open-evals/core
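The quick-start example below also imports from @open-evals/metrics, @open-evals/generator, and the @ai-sdk/openai provider; if those ship as separate packages, install them alongside the core:

pnpm add @open-evals/metrics @open-evals/generator @ai-sdk/openai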

TypeScript
import { EvaluationDataset, evaluate } from '@open-evals/core'
import { Faithfulness } from '@open-evals/metrics'
// transform, embedProperty, generatePersonas, createSynthesizer, and RecursiveCharacterSplitter
// are used below and assumed to be exported by the generator package as well
import {
  synthesize, graph, transform, DocumentNode, chunk, embed, embedProperty,
  summarize, relationship, generatePersonas, createSynthesizer, RecursiveCharacterSplitter,
} from '@open-evals/generator'
import { openai } from '@ai-sdk/openai'

// Wrap your raw document text (`content`) in document nodes
const documents = [new DocumentNode('typescript-guide.md', content, {})]

// Build a knowledge graph: summarize, embed summaries, chunk, embed chunks, link related nodes
const knowledgeGraph = await transform(graph(documents))
  .pipe(summarize(openai.chat('gpt-4.1')))
  .pipe(embedProperty(openai.embedding('text-embedding-3-small'), {
    embedProperty: 'summary',
    propertyName: 'summaryEmbedding',
  }))
  .pipe(chunk(new RecursiveCharacterSplitter()))
  .pipe(embed(openai.embedding('text-embedding-3-small')))
  .pipe(relationship())
  .apply()

// Derive reader personas from the graph
const personas = await generatePersonas(knowledgeGraph, openai.chat('gpt-4.1'), {
  count: 5,
})

// Synthesize a test dataset of single-hop questions (one synthesizer with weight 1)
const dataset = await synthesize({
  graph: knowledgeGraph,
  synthesizers: [
    [createSynthesizer(openai.chat('gpt-4.1'), 'single-hop-specific'), 1],
  ],
  personas,
  count: 10,
})

// Score the dataset with the Faithfulness metric, using GPT-4.1 as the judge model
const results = await evaluate(
  dataset,
  [new Faithfulness({ model: openai.chat('gpt-4.1') })],
  openai.chat('gpt-4.1'),
)

console.log(results)
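The pipeline above turns source documents into a knowledge graph, derives five personas from it, synthesizes a ten-item single-hop question dataset, and scores that dataset with the Faithfulness metric using GPT-4.1 as the judge model.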

Features

Synthetic Data Generation

Automatically generate realistic test data from your domain knowledge.
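Starting from a knowledge graph like the one in the quick-start, a persona-driven dataset takes only a few calls. This is a minimal sketch reusing the quick-start imports and graph; the counts are arbitrary.

TypeScript
// Reuses `knowledgeGraph` and the imports from the quick-start above
const personas = await generatePersonas(knowledgeGraph, openai.chat('gpt-4.1'), { count: 3 })

const dataset = await synthesize({
  graph: knowledgeGraph,
  synthesizers: [
    [createSynthesizer(openai.chat('gpt-4.1'), 'single-hop-specific'), 1],
  ],
  personas,
  count: 25,
})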

Metrics

Built-in metrics for evaluating the quality of your LLM and RAG applications.
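The metrics argument to evaluate is an array, so several built-in metrics can be scored in one pass over the same dataset. The sketch below reuses Faithfulness from the quick-start, which typically judges whether an answer is grounded in its retrieved context, with GPT-4.1 as the judge model.

TypeScript
// Reuses `dataset` and the imports from the quick-start above
const faithfulness = new Faithfulness({ model: openai.chat('gpt-4.1') })

const results = await evaluate(dataset, [faithfulness], openai.chat('gpt-4.1'))
console.log(results)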

Evaluation Framework

A flexible framework for scoring evaluation datasets with the metrics and judge model of your choice.
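Evaluation is not tied to synthetic data: evaluate takes a dataset, a list of metrics, and a judge model, so hand-labelled examples can be scored too. The sample shape below (question, answer, context fields) is an assumption for illustration, not the library's confirmed API.

TypeScript
// Hypothetical sample fields; the real EvaluationDataset shape may differ
const manualDataset = new EvaluationDataset([
  {
    question: 'What does the strict flag enable in tsconfig?',
    answer: 'It turns on the full set of strict type-checking options.',
    context: ['Setting "strict": true enables all strict type-checking options.'],
  },
])

const results = await evaluate(
  manualDataset,
  [new Faithfulness({ model: openai.chat('gpt-4.1') })],
  openai.chat('gpt-4.1'),
)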

RAG Utilities

Utilities for working with RAG applications, including document splitters.
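For example, the RecursiveCharacterSplitter from the quick-start can be applied as a standalone graph transform to chunk and embed documents. This sketch reuses only calls shown above.

TypeScript
// Reuses `documents` and the imports from the quick-start above
const chunkedGraph = await transform(graph(documents))
  .pipe(chunk(new RecursiveCharacterSplitter()))
  .pipe(embed(openai.embedding('text-embedding-3-small')))
  .apply()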