eval-protocol · xzrderek · Aug 2, 2025 · Aug 2, 2025 · Aug 2, 2025 · Aug 2, 2025
diff --git a/tests/pytest/data/markdown_dataset.jsonl b/tests/pytest/data/markdown_dataset.jsonl
@@ -0,0 +1,19 @@
+{"key": 1307, "prompt": "Write an outline for a paper on the history of Yemeni coffee. The outline should include the main points of the paper, and at least 15 sections should be highlighted with markdown such as *highlighted section*.", "num_highlights": 15}
+{"key": 1644, "prompt": "Write a cover letter for a job at a local coffee shop in the form of a poem. Highlight at least 5 text sections using \"*\". For example: *3 years of experience*.", "num_highlights": 5}
+{"key": 1646, "prompt": "Write a casual blog post about similarities across animal species. Highlight at least 5 sections in your answer by starting and ending with \"*\", like: *highlighted text section*.", "num_highlights": 5}
+{"key": 167, "prompt": "Generate a business proposal to start a sweatshirt company in Bremen. The proposal should contain 5 or more sections. Highlight each section name using the this format:\n*section name*", "num_highlights": 5}
+{"key": 168, "prompt": "Write a funny and sarcastic template for rating the quality of a marriage between two people who are both moms. This is for the couple themselves. Please highlight at least 3 sections with markdown,  i.e *highlighted section*.", "num_highlights": 3}
+{"key": 1773, "prompt": "Write a song about the summers of my childhood that I spent in the countryside. Give the song a name, and highlight the name by wrapping it with *. For example: *little me in the countryside*.", "num_highlights": 1}
+{"key": 1886, "prompt": "Write a riddle for the word \"fa\u00e7ade\" that contains at least 3 italic text phrases in markdown syntax, i.e *italic text*.", "num_highlights": 3}
+{"key": 2253, "prompt": "Write a template for a workshop on the importance of diversity in the workplace and highlight at least 3 sections with markdown, i.e. *highlighted section*.", "num_highlights": 3}
+{"key": 2381, "prompt": "Write a cover letter to a local political party, asking to be their rally organizer. Make sure to highlight at least 3 sections in your answer in markdown format.", "num_highlights": 3}
+{"key": 2752, "prompt": "The opposite of youth is not age, but ...? Highlight at least 2 sections in your answer with markdown, i.e. *highlighted section*.", "num_highlights": 2}
+{"key": 2759, "prompt": "Write a description of the following data in a weird style: The Golden Palace eatType restaurant; The Golden Palace food Indian; The Golden Palace area city centre. Use markdown to highlight at least 3 sections in your answer.", "num_highlights": 3}
+{"key": 2790, "prompt": "Write a funny rap about a man who gets a call from an official saying that he is a long lost relative of the king of Nigeria. Use markdown to highlight at least one section of your answer, i.e. *highlighted section*.", "num_highlights": 1}
+{"key": 2905, "prompt": "Expand the riddle into a story with a funny tone:\n\nWhat can you catch but not throw?\nA cold\n\nUse * to highlight at least 2 sections in your text. For example: *this is a highlighted text section*.", "num_highlights": 2}
+{"key": 3071, "prompt": "Write a rap about the renaissance. It should be noticeably different from raps about other historical eras, and have an interesting or weird tone. Highlight at least 3 sections in your answer in markdown format.", "num_highlights": 3}
+{"key": 3453, "prompt": "Summarize the history of Japan. Italicize at least 5 keywords in your response. To indicate a italic word, wrap it with asterisk, like *italic*", "num_highlights": 5}
+{"key": 3549, "prompt": "Write a funny Haiku about a Quaker named John who lives in the town of De Smet, South Dakota. Use the asterisk symbol, *, to highlight some words or phrases twice. Example: *This is a highlighted phrase*.", "num_highlights": 2}
+{"key": 3629, "prompt": "Today, at the 54th Annual Grammy Awards, the Recording Academy honors the talent and creativity of the artists, musicians, and producers who are the creators of the best recordings of the past year. Please continue writing this text in a formal tone, using notations. Highlight some key parts in your response with \"*\", like *highlighted text*.", "num_highlights": 1}
+{"key": 3644, "prompt": "Write a blog post about interesting facts about the Dutch language. Italicize at least 2 sections in your answer with markdown, i.e. *italic text*.", "num_highlights": 2}
+{"key": 2515, "prompt": "Gideon is a farmer who has a surplus of crops from his farm this year. What might he do with that surplus? Highlight at least one section of your answer in markdown, i.e *highlighted section*.", "num_highlights": 1}
diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py
@@ -0,0 +1,84 @@
+"""
+Pytest test for markdown highlighting validation using the evaluation_test decorator.
+
+This test demonstrates how to check if model responses contain the required number of highlighted sections.
+"""
+
+import json
+import re
+from typing import Any, Dict, List, Optional
+
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message
+from eval_protocol.pytest import evaluation_test, default_single_turn_rollout_processor, evaluate
+
+
+def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """
+    Build one EvaluationRow per entry of the markdown dataset.
+
+    For every entry, the "prompt" value becomes the content of a single
+    user message and the "num_highlights" value is stringified and stored
+    as the row's ground_truth.
+    """
+    evaluation_rows: List[EvaluationRow] = []
+    for entry in data:
+        user_message = Message(role="user", content=entry["prompt"])
+        expected_highlights = str(entry["num_highlights"])
+        evaluation_rows.append(
+            EvaluationRow(messages=[user_message], ground_truth=expected_highlights)
+        )
+    return evaluation_rows
+
+
+# Patterns for *italic* and **bold** spans; a span never crosses a newline
+# and never contains an asterisk between its delimiters.
+_SINGLE_HIGHLIGHT_RE = re.compile(r"\*[^\n\*]*\*")
+_DOUBLE_HIGHLIGHT_RE = re.compile(r"\*\*[^\n\*]*\*\*")
+
+
+def _count_markdown_highlights(text: str) -> int:
+    """Return the number of non-empty ``*...*`` and ``**...**`` spans in ``text``."""
+    # A span only counts when something other than whitespace is left after
+    # removing its asterisk delimiters (so a bare "**" is ignored).
+    single_count = sum(
+        1 for span in _SINGLE_HIGHLIGHT_RE.findall(text) if span.strip("*").strip()
+    )
+    double_count = sum(
+        1
+        for span in _DOUBLE_HIGHLIGHT_RE.findall(text)
+        if span.removeprefix("**").removesuffix("**").strip()
+    )
+    return single_count + double_count
+
+
+def markdown_format_evaluate(messages: List[Message], ground_truth: Optional[str]=None, **kwargs) -> EvaluateResult:
+    """
+    Evaluation function that checks if the model's response contains the required number of formatted sections.
+
+    Args:
+        messages: Conversation messages; the last one is expected to be the assistant's reply.
+        ground_truth: Minimum number of highlighted sections, given as an integer string.
+        **kwargs: Accepted for call compatibility; not read by this function.
+
+    Returns:
+        EvaluateResult with score 1.0 when the number of highlighted sections found is at
+        least the required number, and score 0.0 otherwise (including when there is no
+        non-empty assistant reply to inspect).
+
+    Raises:
+        ValueError: If ground_truth is missing or cannot be parsed as an integer. This is
+            only checked once a non-empty assistant reply is present.
+    """
+    # Only score a real assistant reply. If the last message is still the user
+    # prompt, counting its asterisks would give a false pass, because the
+    # prompts themselves contain examples such as *highlighted section*.
+    last_message = messages[-1] if messages else None
+    if last_message is None or last_message.role != "assistant" or not last_message.content:
+        return EvaluateResult(
+            score=0.0,
+            reason="❌ No assistant response found"
+        )
+
+    try:
+        required_highlights = int(ground_truth)
+    except (TypeError, ValueError) as exc:
+        # A missing or malformed target is a dataset problem, not a model failure.
+        raise ValueError(
+            f"ground_truth must be an integer string, got {ground_truth!r}"
+        ) from exc
+
+    # Count formatted sections, e.g. **bold** or *italic*
+    actual_count = _count_markdown_highlights(last_message.content)
+
+    if actual_count >= required_highlights:
+        return EvaluateResult(
+            score=1.0,
+            reason=f"✅ Found {actual_count} highlighted sections (required: {required_highlights})"
+        )
+    return EvaluateResult(
+        score=0.0,
+        reason=f"❌ Only found {actual_count} highlighted sections (required: {required_highlights})"
+    )
+
+
+@evaluation_test(
+    model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
+    input_dataset=["tests/pytest/data/markdown_dataset.jsonl"],
+    dataset_adapter=markdown_dataset_to_evaluation_row,
+    rollout_processor=default_single_turn_rollout_processor,
+    input_params=[{"temperature": 0.0, "max_tokens": 4096}],
+    num_runs=1,
+    threshold_of_success=1.0,
+)
+def test_markdown_highlighting_evaluation(input_dataset, input_params, model):
+    """
+    Score the markdown dataset with markdown_format_evaluate via evaluate().
+
+    The evaluation_test decorator is configured with the dataset path, the
+    dataset adapter, the model name, the sampling parameters and the rollout
+    processor; this body only passes ``input_dataset`` and the evaluation
+    function to ``evaluate()`` and returns its result.
+    """
+    # NOTE(review): input_params and model are accepted but not read here;
+    # presumably the decorator uses them - confirm against evaluation_test.
+    evaluation_output = evaluate(input_dataset, markdown_format_evaluate)
+    return evaluation_output