This module provides functionality for creating, loading, saving, and evaluating datasets of test cases.
Each case must have inputs, and can optionally have a name, expected output, metadata, and case-specific evaluators.
Datasets can be loaded from and saved to YAML or JSON files, and can be evaluated against
a task function to produce an evaluation report.
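For orientation, here is a minimal sketch of that round trip: build a dataset, save it, reload it, and evaluate it against a task. The file name `cases.yaml` and the `classify` task are illustrative placeholders, not part of the library.

```python
from pydantic_evals import Case, Dataset

dataset = Dataset(
    cases=[
        Case(name='positive', inputs='I love this', expected_output='positive'),
        Case(name='negative', inputs='I hate this', expected_output='negative'),
    ]
)
dataset.to_file('cases.yaml')  # format inferred from the extension

reloaded = Dataset.from_file('cases.yaml')


async def classify(text: str) -> str:
    # Stand-in task; a real task would call the system under test.
    return 'positive' if 'love' in text else 'negative'


report = reloaded.evaluate_sync(classify)
report.print()
```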
Each case represents a single test scenario with inputs to test. A case may optionally specify a name, expected
outputs to compare against, and arbitrary metadata.
Cases can also have their own specific evaluators which are run in addition to dataset-level evaluators.
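As a sketch of a case-specific evaluator, following the same `Evaluator` dataclass pattern used in the dataset example below (the `IsUpperCase` class here is hypothetical, not shipped with pydantic_evals):

```python
from dataclasses import dataclass

from pydantic_evals import Case
from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class IsUpperCase(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        # Assumes the task output is a string.
        return ctx.output == ctx.output.upper()


case = Case(
    name='shouting',
    inputs={'text': 'hello'},
    expected_output='HELLO',
    evaluators=(IsUpperCase(),),  # run in addition to any dataset-level evaluators
)
```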
````python
@dataclass(init=False)
class Case(Generic[InputsT, OutputT, MetadataT]):
    """A single row of a [`Dataset`][pydantic_evals.Dataset].

    Each case represents a single test scenario with inputs to test. A case may optionally specify a name, expected
    outputs to compare against, and arbitrary metadata.

    Cases can also have their own specific evaluators which are run in addition to dataset-level evaluators.

    Example:
        ```python
        from pydantic_evals import Case

        case = Case(
            name='Simple addition',
            inputs={'a': 1, 'b': 2},
            expected_output=3,
            metadata={'description': 'Tests basic addition'},
        )
        ```
    """

    name: str | None
    """Name of the case. This is used to identify the case in the report and can be used to filter cases."""
    inputs: InputsT
    """Inputs to the task. This is the input to the task that will be evaluated."""
    metadata: MetadataT | None = None
    """Metadata to be used in the evaluation.

    This can be used to provide additional information about the case to the evaluators.
    """
    expected_output: OutputT | None = None
    """Expected output of the task. This is the expected output of the task that will be evaluated."""
    evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(default_factory=list)
    """Evaluators to be used just on this case."""

    def __init__(
        self,
        *,
        name: str | None = None,
        inputs: InputsT,
        metadata: MetadataT | None = None,
        expected_output: OutputT | None = None,
        evaluators: tuple[Evaluator[InputsT, OutputT, MetadataT], ...] = (),
    ):
        """Initialize a new test case.

        Args:
            name: Optional name for the case. If not provided, a generic name will be assigned when added to a dataset.
            inputs: The inputs to the task being evaluated.
            metadata: Optional metadata for the case, which can be used by evaluators.
            expected_output: Optional expected output of the task, used for comparison in evaluators.
            evaluators: Tuple of evaluators specific to this case. These are in addition to any dataset-level evaluators.
        """
        # Note: `evaluators` must be a tuple instead of Sequence due to misbehavior with pyright's generic parameter
        # inference if it has type `Sequence`
        self.name = name
        self.inputs = inputs
        self.metadata = metadata
        self.expected_output = expected_output
        self.evaluators = list(evaluators)
````
```python
def __init__(
    self,
    *,
    name: str | None = None,
    inputs: InputsT,
    metadata: MetadataT | None = None,
    expected_output: OutputT | None = None,
    evaluators: tuple[Evaluator[InputsT, OutputT, MetadataT], ...] = (),
):
    """Initialize a new test case.

    Args:
        name: Optional name for the case. If not provided, a generic name will be assigned when added to a dataset.
        inputs: The inputs to the task being evaluated.
        metadata: Optional metadata for the case, which can be used by evaluators.
        expected_output: Optional expected output of the task, used for comparison in evaluators.
        evaluators: Tuple of evaluators specific to this case. These are in addition to any dataset-level evaluators.
    """
    # Note: `evaluators` must be a tuple instead of Sequence due to misbehavior with pyright's generic parameter
    # inference if it has type `Sequence`
    self.name = name
    self.inputs = inputs
    self.metadata = metadata
    self.expected_output = expected_output
    self.evaluators = list(evaluators)
```
Datasets allow you to organize a collection of test cases and evaluate them against a task function.
They can be loaded from and saved to YAML or JSON files, and can have dataset-level evaluators that
apply to all cases.
Example:
```python
# Create a dataset with two test cases
from dataclasses import dataclass

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class ExactMatch(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return ctx.output == ctx.expected_output


dataset = Dataset(
    cases=[
        Case(name='test1', inputs={'text': 'Hello'}, expected_output='HELLO'),
        Case(name='test2', inputs={'text': 'World'}, expected_output='WORLD'),
    ],
    evaluators=[ExactMatch()],
)


# Evaluate the dataset against a task function
async def uppercase(inputs: dict) -> str:
    return inputs['text'].upper()


async def main():
    report = await dataset.evaluate(uppercase)
    report.print()
    '''
    Evaluation Summary: uppercase
    ┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
    ┃ Case ID  ┃ Assertions ┃ Duration ┃
    ┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
    │ test1    │     ✔      │     10ms │
    ├──────────┼────────────┼──────────┤
    │ test2    │     ✔      │     10ms │
    ├──────────┼────────────┼──────────┤
    │ Averages │  100.0% ✔  │     10ms │
    └──────────┴────────────┴──────────┘
    '''
```
Source code in pydantic_evals/pydantic_evals/dataset.py
````python
class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', arbitrary_types_allowed=True):
    """A dataset of test [cases][pydantic_evals.Case].

    Datasets allow you to organize a collection of test cases and evaluate them against a task function.
    They can be loaded from and saved to YAML or JSON files, and can have dataset-level evaluators that
    apply to all cases.

    Example:
        ```python
        # Create a dataset with two test cases
        from dataclasses import dataclass

        from pydantic_evals import Case, Dataset
        from pydantic_evals.evaluators import Evaluator, EvaluatorContext


        @dataclass
        class ExactMatch(Evaluator):
            def evaluate(self, ctx: EvaluatorContext) -> bool:
                return ctx.output == ctx.expected_output


        dataset = Dataset(
            cases=[
                Case(name='test1', inputs={'text': 'Hello'}, expected_output='HELLO'),
                Case(name='test2', inputs={'text': 'World'}, expected_output='WORLD'),
            ],
            evaluators=[ExactMatch()],
        )


        # Evaluate the dataset against a task function
        async def uppercase(inputs: dict) -> str:
            return inputs['text'].upper()


        async def main():
            report = await dataset.evaluate(uppercase)
            report.print()
            '''
            Evaluation Summary: uppercase
            ┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
            ┃ Case ID  ┃ Assertions ┃ Duration ┃
            ┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
            │ test1    │     ✔      │     10ms │
            ├──────────┼────────────┼──────────┤
            │ test2    │     ✔      │     10ms │
            ├──────────┼────────────┼──────────┤
            │ Averages │  100.0% ✔  │     10ms │
            └──────────┴────────────┴──────────┘
            '''
        ```
    """

    cases: list[Case[InputsT, OutputT, MetadataT]]
    """List of test cases in the dataset."""
    evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = []
    """List of evaluators to be used on all cases in the dataset."""

    def __init__(
        self,
        *,
        cases: Sequence[Case[InputsT, OutputT, MetadataT]],
        evaluators: Sequence[Evaluator[InputsT, OutputT, MetadataT]] = (),
    ):
        """Initialize a new dataset with test cases and optional evaluators.

        Args:
            cases: Sequence of test cases to include in the dataset.
            evaluators: Optional sequence of evaluators to apply to all cases in the dataset.
        """
        case_names = set[str]()
        for case in cases:
            if case.name is None:
                continue
            if case.name in case_names:
                raise ValueError(f'Duplicate case name: {case.name!r}')
            case_names.add(case.name)

        super().__init__(
            cases=cases,
            evaluators=list(evaluators),
        )

    async def evaluate(
        self, task: Callable[[InputsT], Awaitable[OutputT]], name: str | None = None, max_concurrency: int | None = None
    ) -> EvaluationReport:
        """Evaluates the test cases in the dataset using the given task.

        This method runs the task on each case in the dataset, applies evaluators,
        and collects results into a report. Cases are run concurrently, limited by `max_concurrency` if specified.

        Args:
            task: The task to evaluate. This should be a callable that takes the inputs of the case
                and returns the output.
            name: The name of the task being evaluated, this is used to identify the task in the report.
                If omitted, the name of the task function will be used.
            max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                If None, all cases will be evaluated concurrently.

        Returns:
            A report containing the results of the evaluation.
        """
        name = name or get_unwrapped_function_name(task)

        limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()
        with _logfire.span('evaluate {name}', name=name) as eval_span:

            async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
                async with limiter:
                    return await _run_task_and_evaluators(task, case, report_case_name, self.evaluators)

            report = EvaluationReport(
                name=name,
                cases=await task_group_gather(
                    [
                        lambda case=case, i=i: _handle_case(case, case.name or f'Case {i}')
                        for i, case in enumerate(self.cases, 1)
                    ]
                ),
            )
            # TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel:
            eval_span.set_attribute('cases', report.cases)
            # TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel
            eval_span.set_attribute('averages', report.averages())
        return report

    def evaluate_sync(
        self, task: Callable[[InputsT], Awaitable[OutputT]], name: str | None = None, max_concurrency: int | None = None
    ) -> EvaluationReport:  # pragma: no cover
        """Evaluates the test cases in the dataset using the given task.

        This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.

        Args:
            task: The task to evaluate. This should be a callable that takes the inputs of the case
                and returns the output.
            name: The name of the task being evaluated, this is used to identify the task in the report.
                If omitted, the name of the task function will be used.
            max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                If None, all cases will be evaluated concurrently.

        Returns:
            A report containing the results of the evaluation.
        """
        return get_event_loop().run_until_complete(self.evaluate(task, name=name, max_concurrency=max_concurrency))

    def add_case(
        self,
        *,
        name: str | None = None,
        inputs: InputsT,
        metadata: MetadataT | None = None,
        expected_output: OutputT | None = None,
        evaluators: tuple[Evaluator[InputsT, OutputT, MetadataT], ...] = (),
    ) -> None:
        """Adds a case to the dataset.

        This is a convenience method for creating a [`Case`][pydantic_evals.Case] and adding it to the dataset.

        Args:
            name: Optional name for the case. If not provided, a generic name will be assigned.
            inputs: The inputs to the task being evaluated.
            metadata: Optional metadata for the case, which can be used by evaluators.
            expected_output: The expected output of the task, used for comparison in evaluators.
            evaluators: Tuple of evaluators specific to this case, in addition to dataset-level evaluators.
        """
        if name in {case.name for case in self.cases}:
            raise ValueError(f'Duplicate case name: {name!r}')

        case = Case[InputsT, OutputT, MetadataT](
            name=name,
            inputs=inputs,
            metadata=metadata,
            expected_output=expected_output,
            evaluators=evaluators,
        )
        self.cases.append(case)

    def add_evaluator(
        self,
        evaluator: Evaluator[InputsT, OutputT, MetadataT],
        specific_case: str | None = None,
    ) -> None:
        """Adds an evaluator to the dataset or a specific case.

        Args:
            evaluator: The evaluator to add.
            specific_case: If provided, the evaluator will only be added to the case with this name.
                If None, the evaluator will be added to all cases in the dataset.

        Raises:
            ValueError: If `specific_case` is provided but no case with that name exists in the dataset.
        """
        if specific_case is None:
            self.evaluators.append(evaluator)
        else:
            # If this is too slow, we could try to add a case lookup dict.
            # Note that if we do that, we'd need to make the cases list private to prevent modification.
            added = False
            for case in self.cases:
                if case.name == specific_case:
                    case.evaluators.append(evaluator)
                    added = True
            if not added:
                raise ValueError(f'Case {specific_case!r} not found in the dataset')

    @classmethod
    @functools.cache
    def _params(cls) -> tuple[type[InputsT], type[OutputT], type[MetadataT]]:
        """Get the type parameters for the Dataset class.

        Returns:
            A tuple of (InputsT, OutputT, MetadataT) types.
        """
        for c in cls.__mro__:
            metadata = getattr(c, '__pydantic_generic_metadata__', {})
            if len(args := (metadata.get('args', ()) or getattr(c, '__args__', ()))) == 3:
                return args
        else:  # pragma: no cover
            warnings.warn(
                f'Could not determine the generic parameters for {cls}; using `Any` for each.'
                f' You should explicitly set the generic parameters via `Dataset[MyInputs, MyOutput, MyMetadata]`'
                f' when serializing or deserializing.',
                UserWarning,
            )
            return Any, Any, Any  # type: ignore

    @classmethod
    def from_file(
        cls,
        path: Path | str,
        fmt: Literal['yaml', 'json'] | None = None,
        custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
    ) -> Self:
        """Load a dataset from a file.

        Args:
            path: Path to the file to load.
            fmt: Format of the file. If None, the format will be inferred from the file extension.
                Must be either 'yaml' or 'json'.
            custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                These are additional evaluators beyond the default ones.

        Returns:
            A new Dataset instance loaded from the file.

        Raises:
            ValidationError: If the file cannot be parsed as a valid dataset.
            ValueError: If the format cannot be inferred from the file extension.
        """
        path = Path(path)
        fmt = cls._infer_fmt(path, fmt)

        raw = Path(path).read_text()
        try:
            return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types)
        except ValidationError as e:  # pragma: no cover
            raise ValueError(f'{path} contains data that does not match the schema for {cls.__name__}:\n{e}.') from e

    @classmethod
    def from_text(
        cls,
        contents: str,
        fmt: Literal['yaml', 'json'] = 'yaml',
        custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
    ) -> Self:
        """Load a dataset from a string.

        Args:
            contents: The string content to parse.
            fmt: Format of the content. Must be either 'yaml' or 'json'.
            custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                These are additional evaluators beyond the default ones.

        Returns:
            A new Dataset instance parsed from the string.

        Raises:
            ValidationError: If the content cannot be parsed as a valid dataset.
        """
        if fmt == 'yaml':
            loaded = yaml.safe_load(contents)
            return cls.from_dict(loaded, custom_evaluator_types)
        else:
            dataset_model_type = cls._serialization_type()
            dataset_model = dataset_model_type.model_validate_json(contents)
            return cls._from_dataset_model(dataset_model, custom_evaluator_types)

    @classmethod
    def from_dict(
        cls,
        data: dict[str, Any],
        custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
    ) -> Self:
        """Load a dataset from a dictionary.

        Args:
            data: Dictionary representation of the dataset.
            custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                These are additional evaluators beyond the default ones.

        Returns:
            A new Dataset instance created from the dictionary.

        Raises:
            ValidationError: If the dictionary cannot be converted to a valid dataset.
        """
        dataset_model_type = cls._serialization_type()
        dataset_model = dataset_model_type.model_validate(data)
        return cls._from_dataset_model(dataset_model, custom_evaluator_types)

    @classmethod
    def _from_dataset_model(
        cls,
        dataset_model: _DatasetModel[InputsT, OutputT, MetadataT],
        custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
    ) -> Self:
        """Create a Dataset from a _DatasetModel.

        Args:
            dataset_model: The _DatasetModel to convert.
            custom_evaluator_types: Custom evaluator classes to register for deserialization.

        Returns:
            A new Dataset instance created from the _DatasetModel.
        """
        registry = _get_registry(custom_evaluator_types)

        cases: list[Case[InputsT, OutputT, MetadataT]] = []
        errors: list[ValueError] = []
        dataset_evaluators: list[Evaluator] = []
        for spec in dataset_model.evaluators:
            try:
                dataset_evaluator = _load_evaluator_from_registry(registry, None, spec)
            except ValueError as e:
                errors.append(e)
                continue
            dataset_evaluators.append(dataset_evaluator)

        for row in dataset_model.cases:
            evaluators: list[Evaluator] = []
            for spec in row.evaluators:
                try:
                    evaluator = _load_evaluator_from_registry(registry, row.name, spec)
                except ValueError as e:
                    errors.append(e)
                    continue
                evaluators.append(evaluator)
            row = Case[InputsT, OutputT, MetadataT](
                name=row.name,
                inputs=row.inputs,
                metadata=row.metadata,
                expected_output=row.expected_output,
            )
            row.evaluators = evaluators
            cases.append(row)
        if errors:
            raise ExceptionGroup(f'{len(errors)} error(s) loading evaluators from registry', errors[:3])
        result = cls(cases=cases)
        result.evaluators = dataset_evaluators
        return result

    def to_file(
        self,
        path: Path | str,
        fmt: Literal['yaml', 'json'] | None = None,
        schema_path: Path | str | None = DEFAULT_SCHEMA_PATH_TEMPLATE,
        custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
    ):
        """Save the dataset to a file.

        Args:
            path: Path to save the dataset to.
            fmt: Format to use. If None, the format will be inferred from the file extension.
                Must be either 'yaml' or 'json'.
            schema_path: Path to save the JSON schema to. If None, no schema will be saved.
                Can be a string template with {stem} which will be replaced with the dataset filename stem.
            custom_evaluator_types: Custom evaluator classes to include in the schema.
        """
        path = Path(path)
        fmt = self._infer_fmt(path, fmt)

        schema_ref: str | None = None
        if schema_path is not None:
            if isinstance(schema_path, str):
                schema_path = Path(schema_path.format(stem=path.stem))

            if not schema_path.is_absolute():
                schema_ref = str(schema_path)
                schema_path = path.parent / schema_path
            elif schema_path.is_relative_to(path):  # pragma: no cover
                schema_ref = str(_get_relative_path_reference(schema_path, path))
            else:  # pragma: no cover
                schema_ref = str(schema_path)
            self._save_schema(schema_path, custom_evaluator_types)

        context: dict[str, Any] = {'use_short_form': True}
        if fmt == 'yaml':
            dumped_data = self.model_dump(mode='json', by_alias=True, exclude_defaults=True, context=context)
            content = yaml.dump(dumped_data, sort_keys=False)
            if schema_ref:
                yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
                content = f'{yaml_language_server_line}\n{content}'
            path.write_text(content)
        else:
            context['$schema'] = schema_ref
            json_data = self.model_dump_json(indent=2, by_alias=True, exclude_defaults=True, context=context)
            path.write_text(json_data + '\n')

    @classmethod
    def model_json_schema_with_evaluators(
        cls,
        custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
    ) -> dict[str, Any]:
        """Generate a JSON schema for this dataset type, including evaluator details.

        This is useful for generating a schema that can be used to validate YAML-format dataset files.

        Args:
            custom_evaluator_types: Custom evaluator classes to include in the schema.

        Returns:
            A dictionary representing the JSON schema.
        """
        # Note: this function could maybe be simplified now that Evaluators are always dataclasses
        registry = _get_registry(custom_evaluator_types)

        evaluator_schema_types: list[Any] = []
        for name, evaluator_class in registry.items():
            type_hints = _typing_extra.get_function_type_hints(evaluator_class)
            type_hints.pop('return', None)
            required_type_hints: dict[str, Any] = {}

            for p in inspect.signature(evaluator_class).parameters.values():
                type_hints.setdefault(p.name, Any)
                if p.default is not p.empty:
                    type_hints[p.name] = NotRequired[type_hints[p.name]]
                else:
                    required_type_hints[p.name] = type_hints[p.name]

            def _make_typed_dict(cls_name_prefix: str, fields: dict[str, Any]) -> Any:
                td = TypedDict(f'{cls_name_prefix}_{name}', fields)  # pyright: ignore[reportArgumentType]
                config = ConfigDict(extra='forbid', arbitrary_types_allowed=True)
                # TODO: Replace with pydantic.with_config after pydantic 2.11 is released
                td.__pydantic_config__ = config  # pyright: ignore[reportAttributeAccessIssue]
                return td

            # Shortest form: just the call name
            if len(type_hints) == 0 or not required_type_hints:
                evaluator_schema_types.append(Literal[name])

            # Short form: can be called with only one parameter
            if len(type_hints) == 1:
                [type_hint_type] = type_hints.values()
                evaluator_schema_types.append(_make_typed_dict('short_evaluator', {name: type_hint_type}))
            elif len(required_type_hints) == 1:
                [type_hint_type] = required_type_hints.values()
                evaluator_schema_types.append(_make_typed_dict('short_evaluator', {name: type_hint_type}))

            # Long form: multiple parameters, possibly required
            if len(type_hints) > 1:
                params_td = _make_typed_dict('evaluator_params', type_hints)
                evaluator_schema_types.append(_make_typed_dict('evaluator', {name: params_td}))

        in_type, out_type, meta_type = cls._params()

        # Note: we shadow the `Case` and `Dataset` class names here to generate a clean JSON schema
        class Case(BaseModel, extra='forbid'):  # pyright: ignore[reportUnusedClass]  # this _is_ used below, but pyright doesn't seem to notice..
            name: str | None = None
            inputs: in_type  # pyright: ignore[reportInvalidTypeForm]
            metadata: meta_type | None = None  # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
            expected_output: out_type | None = None  # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
            if evaluator_schema_types:
                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007

        class Dataset(BaseModel, extra='forbid'):
            cases: list[Case]
            if evaluator_schema_types:
                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007

        json_schema = Dataset.model_json_schema()
        # See `_add_json_schema` below, since `$schema` is added to the JSON, it has to be supported in the JSON
        json_schema['properties']['$schema'] = {'type': 'string'}
        return json_schema

    @classmethod
    def _save_schema(
        cls, path: Path | str, custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = ()
    ):
        """Save the JSON schema for this dataset type to a file.

        Args:
            path: Path to save the schema to.
            custom_evaluator_types: Custom evaluator classes to include in the schema.
        """
        path = Path(path)
        json_schema = cls.model_json_schema_with_evaluators(custom_evaluator_types)
        schema_content = to_json(json_schema, indent=2).decode() + '\n'
        if not path.exists() or path.read_text() != schema_content:
            path.write_text(schema_content)

    @classmethod
    @functools.cache
    def _serialization_type(cls) -> type[_DatasetModel[InputsT, OutputT, MetadataT]]:
        """Get the serialization type for this dataset class.

        Returns:
            A _DatasetModel type with the same generic parameters as this Dataset class.
        """
        input_type, output_type, metadata_type = cls._params()
        return _DatasetModel[input_type, output_type, metadata_type]

    @classmethod
    def _infer_fmt(cls, path: Path, fmt: Literal['yaml', 'json'] | None) -> Literal['yaml', 'json']:
        """Infer the format to use for a file based on its extension.

        Args:
            path: The path to infer the format for.
            fmt: The explicitly provided format, if any.

        Returns:
            The inferred format ('yaml' or 'json').

        Raises:
            ValueError: If the format cannot be inferred from the file extension.
        """
        if fmt is not None:
            return fmt
        suffix = path.suffix.lower()
        if suffix in {'.yaml', '.yml'}:
            return 'yaml'
        elif suffix == '.json':
            return 'json'
        raise ValueError(
            f'Could not infer format for filename {path.name!r}. Use the `fmt` argument to specify the format.'
        )

    @model_serializer(mode='wrap')
    def _add_json_schema(self, nxt: SerializerFunctionWrapHandler, info: SerializationInfo) -> dict[str, Any]:
        """Add the JSON schema path to the serialized output.

        See <https://github.com/json-schema-org/json-schema-spec/issues/828> for context, that seems to be the
        nearest there is to a spec for this.
        """
        context = cast(Union[dict[str, Any], None], info.context)
        if isinstance(context, dict) and (schema := context.get('$schema')):
            return {'$schema': schema} | nxt(self)
        else:
            return nxt(self)
````
```python
def __init__(
    self,
    *,
    cases: Sequence[Case[InputsT, OutputT, MetadataT]],
    evaluators: Sequence[Evaluator[InputsT, OutputT, MetadataT]] = (),
):
    """Initialize a new dataset with test cases and optional evaluators.

    Args:
        cases: Sequence of test cases to include in the dataset.
        evaluators: Optional sequence of evaluators to apply to all cases in the dataset.
    """
    case_names = set[str]()
    for case in cases:
        if case.name is None:
            continue
        if case.name in case_names:
            raise ValueError(f'Duplicate case name: {case.name!r}')
        case_names.add(case.name)

    super().__init__(
        cases=cases,
        evaluators=list(evaluators),
    )
```
Evaluates the test cases in the dataset using the given task.
This method runs the task on each case in the dataset, applies evaluators,
and collects results into a report. Cases are run concurrently, limited by max_concurrency if specified.
```python
async def evaluate(
    self, task: Callable[[InputsT], Awaitable[OutputT]], name: str | None = None, max_concurrency: int | None = None
) -> EvaluationReport:
    """Evaluates the test cases in the dataset using the given task.

    This method runs the task on each case in the dataset, applies evaluators,
    and collects results into a report. Cases are run concurrently, limited by `max_concurrency` if specified.

    Args:
        task: The task to evaluate. This should be a callable that takes the inputs of the case
            and returns the output.
        name: The name of the task being evaluated, this is used to identify the task in the report.
            If omitted, the name of the task function will be used.
        max_concurrency: The maximum number of concurrent evaluations of the task to allow.
            If None, all cases will be evaluated concurrently.

    Returns:
        A report containing the results of the evaluation.
    """
    name = name or get_unwrapped_function_name(task)

    limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()
    with _logfire.span('evaluate {name}', name=name) as eval_span:

        async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
            async with limiter:
                return await _run_task_and_evaluators(task, case, report_case_name, self.evaluators)

        report = EvaluationReport(
            name=name,
            cases=await task_group_gather(
                [
                    lambda case=case, i=i: _handle_case(case, case.name or f'Case {i}')
                    for i, case in enumerate(self.cases, 1)
                ]
            ),
        )
        # TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel:
        eval_span.set_attribute('cases', report.cases)
        # TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel
        eval_span.set_attribute('averages', report.averages())
    return report
```
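A short sketch of limiting concurrency during evaluation; `dataset` and `my_task` are assumed to already exist (for example, built as in the class docstring example above):

```python
async def main():
    # At most 4 cases run at once; omit max_concurrency to run all cases concurrently.
    report = await dataset.evaluate(my_task, name='my-task-v1', max_concurrency=4)
    report.print()
```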
```python
def evaluate_sync(
    self, task: Callable[[InputsT], Awaitable[OutputT]], name: str | None = None, max_concurrency: int | None = None
) -> EvaluationReport:  # pragma: no cover
    """Evaluates the test cases in the dataset using the given task.

    This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.

    Args:
        task: The task to evaluate. This should be a callable that takes the inputs of the case
            and returns the output.
        name: The name of the task being evaluated, this is used to identify the task in the report.
            If omitted, the name of the task function will be used.
        max_concurrency: The maximum number of concurrent evaluations of the task to allow.
            If None, all cases will be evaluated concurrently.

    Returns:
        A report containing the results of the evaluation.
    """
    return get_event_loop().run_until_complete(self.evaluate(task, name=name, max_concurrency=max_concurrency))
```
```python
def add_case(
    self,
    *,
    name: str | None = None,
    inputs: InputsT,
    metadata: MetadataT | None = None,
    expected_output: OutputT | None = None,
    evaluators: tuple[Evaluator[InputsT, OutputT, MetadataT], ...] = (),
) -> None:
    """Adds a case to the dataset.

    This is a convenience method for creating a [`Case`][pydantic_evals.Case] and adding it to the dataset.

    Args:
        name: Optional name for the case. If not provided, a generic name will be assigned.
        inputs: The inputs to the task being evaluated.
        metadata: Optional metadata for the case, which can be used by evaluators.
        expected_output: The expected output of the task, used for comparison in evaluators.
        evaluators: Tuple of evaluators specific to this case, in addition to dataset-level evaluators.
    """
    if name in {case.name for case in self.cases}:
        raise ValueError(f'Duplicate case name: {name!r}')

    case = Case[InputsT, OutputT, MetadataT](
        name=name,
        inputs=inputs,
        metadata=metadata,
        expected_output=expected_output,
        evaluators=evaluators,
    )
    self.cases.append(case)
```
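A sketch of growing a dataset programmatically; `dataset` is assumed to be an existing `Dataset` instance with dict inputs and string outputs, and the case values are illustrative:

```python
dataset.add_case(
    name='greeting',
    inputs={'text': 'hi there'},
    expected_output='HI THERE',
    metadata={'category': 'smoke'},
)
```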
```python
def add_evaluator(
    self,
    evaluator: Evaluator[InputsT, OutputT, MetadataT],
    specific_case: str | None = None,
) -> None:
    """Adds an evaluator to the dataset or a specific case.

    Args:
        evaluator: The evaluator to add.
        specific_case: If provided, the evaluator will only be added to the case with this name.
            If None, the evaluator will be added to all cases in the dataset.

    Raises:
        ValueError: If `specific_case` is provided but no case with that name exists in the dataset.
    """
    if specific_case is None:
        self.evaluators.append(evaluator)
    else:
        # If this is too slow, we could try to add a case lookup dict.
        # Note that if we do that, we'd need to make the cases list private to prevent modification.
        added = False
        for case in self.cases:
            if case.name == specific_case:
                case.evaluators.append(evaluator)
                added = True
        if not added:
            raise ValueError(f'Case {specific_case!r} not found in the dataset')
```
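A sketch of attaching evaluators after construction, reusing the `ExactMatch` evaluator from the class docstring example; `'test1'` must be the name of an existing case:

```python
dataset.add_evaluator(ExactMatch())                         # applies to every case
dataset.add_evaluator(ExactMatch(), specific_case='test1')  # applies to one case only
```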
```python
@classmethod
def from_file(
    cls,
    path: Path | str,
    fmt: Literal['yaml', 'json'] | None = None,
    custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
) -> Self:
    """Load a dataset from a file.

    Args:
        path: Path to the file to load.
        fmt: Format of the file. If None, the format will be inferred from the file extension.
            Must be either 'yaml' or 'json'.
        custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
            These are additional evaluators beyond the default ones.

    Returns:
        A new Dataset instance loaded from the file.

    Raises:
        ValidationError: If the file cannot be parsed as a valid dataset.
        ValueError: If the format cannot be inferred from the file extension.
    """
    path = Path(path)
    fmt = cls._infer_fmt(path, fmt)

    raw = Path(path).read_text()
    try:
        return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types)
    except ValidationError as e:  # pragma: no cover
        raise ValueError(f'{path} contains data that does not match the schema for {cls.__name__}:\n{e}.') from e
```
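A sketch of loading a dataset file that references a custom evaluator; `cases.yaml` and `MyEvaluator` are placeholders for your own file and `Evaluator` subclass. Parameterizing the generic `Dataset` avoids the "could not determine the generic parameters" warning and gives typed cases:

```python
dataset = Dataset[dict, str, dict].from_file(
    'cases.yaml',
    custom_evaluator_types=[MyEvaluator],  # hypothetical custom Evaluator subclass
)
```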
```python
@classmethod
def from_text(
    cls,
    contents: str,
    fmt: Literal['yaml', 'json'] = 'yaml',
    custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
) -> Self:
    """Load a dataset from a string.

    Args:
        contents: The string content to parse.
        fmt: Format of the content. Must be either 'yaml' or 'json'.
        custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
            These are additional evaluators beyond the default ones.

    Returns:
        A new Dataset instance parsed from the string.

    Raises:
        ValidationError: If the content cannot be parsed as a valid dataset.
    """
    if fmt == 'yaml':
        loaded = yaml.safe_load(contents)
        return cls.from_dict(loaded, custom_evaluator_types)
    else:
        dataset_model_type = cls._serialization_type()
        dataset_model = dataset_model_type.model_validate_json(contents)
        return cls._from_dataset_model(dataset_model, custom_evaluator_types)
```
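A sketch of parsing an inline YAML string (`fmt` defaults to `'yaml'`); the field layout follows the case fields shown in the serialization schema above, and the specific values are illustrative:

```python
yaml_contents = """
cases:
  - name: test1
    inputs:
      text: Hello
    expected_output: HELLO
"""
dataset = Dataset[dict, str, dict].from_text(yaml_contents)
```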
```python
@classmethod
def from_dict(
    cls,
    data: dict[str, Any],
    custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
) -> Self:
    """Load a dataset from a dictionary.

    Args:
        data: Dictionary representation of the dataset.
        custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
            These are additional evaluators beyond the default ones.

    Returns:
        A new Dataset instance created from the dictionary.

    Raises:
        ValidationError: If the dictionary cannot be converted to a valid dataset.
    """
    dataset_model_type = cls._serialization_type()
    dataset_model = dataset_model_type.model_validate(data)
    return cls._from_dataset_model(dataset_model, custom_evaluator_types)
```
`schema_path`: Path to save the JSON schema to. If `None`, no schema will be saved.
Can be a string template containing `{stem}`, which will be replaced with the dataset filename stem.
```python
def to_file(
    self,
    path: Path | str,
    fmt: Literal['yaml', 'json'] | None = None,
    schema_path: Path | str | None = DEFAULT_SCHEMA_PATH_TEMPLATE,
    custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
):
    """Save the dataset to a file.

    Args:
        path: Path to save the dataset to.
        fmt: Format to use. If None, the format will be inferred from the file extension.
            Must be either 'yaml' or 'json'.
        schema_path: Path to save the JSON schema to. If None, no schema will be saved.
            Can be a string template with {stem} which will be replaced with the dataset filename stem.
        custom_evaluator_types: Custom evaluator classes to include in the schema.
    """
    path = Path(path)
    fmt = self._infer_fmt(path, fmt)

    schema_ref: str | None = None
    if schema_path is not None:
        if isinstance(schema_path, str):
            schema_path = Path(schema_path.format(stem=path.stem))

        if not schema_path.is_absolute():
            schema_ref = str(schema_path)
            schema_path = path.parent / schema_path
        elif schema_path.is_relative_to(path):  # pragma: no cover
            schema_ref = str(_get_relative_path_reference(schema_path, path))
        else:  # pragma: no cover
            schema_ref = str(schema_path)
        self._save_schema(schema_path, custom_evaluator_types)

    context: dict[str, Any] = {'use_short_form': True}
    if fmt == 'yaml':
        dumped_data = self.model_dump(mode='json', by_alias=True, exclude_defaults=True, context=context)
        content = yaml.dump(dumped_data, sort_keys=False)
        if schema_ref:
            yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
            content = f'{yaml_language_server_line}\n{content}'
        path.write_text(content)
    else:
        context['$schema'] = schema_ref
        json_data = self.model_dump_json(indent=2, by_alias=True, exclude_defaults=True, context=context)
        path.write_text(json_data + '\n')
```
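A sketch of saving with an explicit schema path template; `{stem}` expands to the dataset filename stem (here `eval_cases`), and passing `schema_path=None` skips the schema entirely. The exact default template is defined by `DEFAULT_SCHEMA_PATH_TEMPLATE`. As the source above shows, YAML output gets a `yaml-language-server` schema comment and JSON output gets a `$schema` property referencing the schema file.

```python
dataset.to_file('eval_cases.yaml', schema_path='{stem}_schema.json')
```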
```python
@classmethod
def model_json_schema_with_evaluators(
    cls,
    custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
) -> dict[str, Any]:
    """Generate a JSON schema for this dataset type, including evaluator details.

    This is useful for generating a schema that can be used to validate YAML-format dataset files.

    Args:
        custom_evaluator_types: Custom evaluator classes to include in the schema.

    Returns:
        A dictionary representing the JSON schema.
    """
    # Note: this function could maybe be simplified now that Evaluators are always dataclasses
    registry = _get_registry(custom_evaluator_types)

    evaluator_schema_types: list[Any] = []
    for name, evaluator_class in registry.items():
        type_hints = _typing_extra.get_function_type_hints(evaluator_class)
        type_hints.pop('return', None)
        required_type_hints: dict[str, Any] = {}

        for p in inspect.signature(evaluator_class).parameters.values():
            type_hints.setdefault(p.name, Any)
            if p.default is not p.empty:
                type_hints[p.name] = NotRequired[type_hints[p.name]]
            else:
                required_type_hints[p.name] = type_hints[p.name]

        def _make_typed_dict(cls_name_prefix: str, fields: dict[str, Any]) -> Any:
            td = TypedDict(f'{cls_name_prefix}_{name}', fields)  # pyright: ignore[reportArgumentType]
            config = ConfigDict(extra='forbid', arbitrary_types_allowed=True)
            # TODO: Replace with pydantic.with_config after pydantic 2.11 is released
            td.__pydantic_config__ = config  # pyright: ignore[reportAttributeAccessIssue]
            return td

        # Shortest form: just the call name
        if len(type_hints) == 0 or not required_type_hints:
            evaluator_schema_types.append(Literal[name])

        # Short form: can be called with only one parameter
        if len(type_hints) == 1:
            [type_hint_type] = type_hints.values()
            evaluator_schema_types.append(_make_typed_dict('short_evaluator', {name: type_hint_type}))
        elif len(required_type_hints) == 1:
            [type_hint_type] = required_type_hints.values()
            evaluator_schema_types.append(_make_typed_dict('short_evaluator', {name: type_hint_type}))

        # Long form: multiple parameters, possibly required
        if len(type_hints) > 1:
            params_td = _make_typed_dict('evaluator_params', type_hints)
            evaluator_schema_types.append(_make_typed_dict('evaluator', {name: params_td}))

    in_type, out_type, meta_type = cls._params()

    # Note: we shadow the `Case` and `Dataset` class names here to generate a clean JSON schema
    class Case(BaseModel, extra='forbid'):  # pyright: ignore[reportUnusedClass]  # this _is_ used below, but pyright doesn't seem to notice..
        name: str | None = None
        inputs: in_type  # pyright: ignore[reportInvalidTypeForm]
        metadata: meta_type | None = None  # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
        expected_output: out_type | None = None  # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
        if evaluator_schema_types:
            evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007

    class Dataset(BaseModel, extra='forbid'):
        cases: list[Case]
        if evaluator_schema_types:
            evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007

    json_schema = Dataset.model_json_schema()
    # See `_add_json_schema` below, since `$schema` is added to the JSON, it has to be supported in the JSON
    json_schema['properties']['$schema'] = {'type': 'string'}
    return json_schema
```
Source code in pydantic_evals/pydantic_evals/dataset.py
```python
def set_eval_attribute(name: str, value: Any) -> None:
    """Set an attribute on the current task run.

    Args:
        name: The name of the attribute.
        value: The value of the attribute.
    """
    current_case = _CURRENT_TASK_RUN.get()
    if current_case is not None:
        current_case.record_attribute(name, value)
```
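A sketch of recording an attribute from inside a task so it appears on that case's run; the task and attribute name are illustrative. As the source shows, calling it outside a task run is a no-op.

```python
from pydantic_evals.dataset import set_eval_attribute


async def my_task(inputs: dict) -> str:
    result = inputs['text'].upper()
    set_eval_attribute('used_fallback', False)
    return result
```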
Source code in pydantic_evals/pydantic_evals/dataset.py
```python
def increment_eval_metric(name: str, amount: int | float) -> None:
    """Increment a metric on the current task run.

    Args:
        name: The name of the metric.
        amount: The amount to increment by.
    """
    current_case = _CURRENT_TASK_RUN.get()
    if current_case is not None:
        current_case.increment_metric(name, amount)
```
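A sketch of accumulating a numeric metric during a task run; the metric name and task body are illustrative, and like `set_eval_attribute` this is a no-op outside a task run.

```python
from pydantic_evals.dataset import increment_eval_metric


async def my_task(inputs: dict) -> str:
    increment_eval_metric('characters_processed', len(inputs['text']))
    return inputs['text'].upper()
```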