# 1. Define success criteria
# Target: 95% accuracy on 100-message test set

# 2. Create evaluation dataset
# Each entry pairs a raw customer message with its gold sentiment label
# ("positive" | "neutral" | "negative", lowercase).
eval_data = [
    {"message": "Your product broke after one day!", "label": "negative"},
    {"message": "It works fine, nothing special", "label": "neutral"},
    {"message": "Best purchase of my life!", "label": "positive"},
    # ... 97 more examples
]
# 3. Test current prompt
def test_prompt(prompt_template: str, eval_data: list) -> float:
correct = 0
for item in eval_data:
prompt = prompt_template.format(message=item["message"])
prediction = llm.generate(prompt)
if prediction.strip().lower() == item["label"]:
correct += 1
accuracy = correct / len(eval_data)
return accuracy
# 4. Analyze failures
def analyze_failures(prompt_template: str, eval_data: list):
    """Collect every eval item the LLM mislabels.

    Each failure records the original message, the expected label, and the
    model's raw (un-normalized) response for inspection.
    """
    misses = []
    for example in eval_data:
        rendered = prompt_template.format(message=example["message"])
        answer = llm.generate(rendered)
        if answer.strip().lower() == example["label"]:
            continue  # correct prediction — nothing to record
        misses.append(
            {
                "input": example["message"],
                "expected": example["label"],
                "actual": answer,
            }
        )
    return misses
# 5. Iterate
# Each version is scored against the same eval set; the trailing comments
# record the measured accuracy from a prior run (target: 95%).

# v1: bare instruction — underspecified, so the model's output format drifts.
v1_prompt = "Classify sentiment: {message}"
v1_accuracy = test_prompt(v1_prompt, eval_data) # 78%

# v2: names the three allowed labels and ends with a "Sentiment:" completion cue.
v2_prompt = """
Classify the sentiment of this message as positive, neutral, or negative.
Message: {message}
Sentiment:"""
v2_accuracy = test_prompt(v2_prompt, eval_data) # 89%

# v3: XML-tagged structure with few-shot examples and an explicit output
# vocabulary — first version to clear the 95% target.
v3_prompt = """
<task>Classify customer sentiment</task>
<examples>
Positive: "Love this!", "Best ever!"
Neutral: "It's okay", "Does the job"
Negative: "Terrible", "Waste of money"
</examples>
<message>{message}</message>
<output>positive|neutral|negative</output>
"""
v3_accuracy = test_prompt(v3_prompt, eval_data) # 96% ✓