Optymalizacja soneta-form-xml skill
This commit is contained in:
@@ -0,0 +1,187 @@
|
||||
{
|
||||
"metadata": {
|
||||
"skill_name": "soneta-form-xml",
|
||||
"skill_path": "/Users/marcin/d/Skills/soneta-form-xml",
|
||||
"executor_model": "claude-sonnet-4-6",
|
||||
"timestamp": "2026-03-05T12:00:00Z",
|
||||
"evals_run": ["pageform-z-gridem", "viewform-lista", "warunkowa-widocznosc"],
|
||||
"runs_per_configuration": 1
|
||||
},
|
||||
|
||||
"runs": [
|
||||
{
|
||||
"eval_id": 1,
|
||||
"eval_name": "pageform-z-gridem",
|
||||
"configuration": "with_skill",
|
||||
"run_number": 1,
|
||||
"result": {
|
||||
"pass_rate": 1.0,
|
||||
"passed": 8,
|
||||
"failed": 0,
|
||||
"total": 8,
|
||||
"time_seconds": 37.8,
|
||||
"tokens": 32707,
|
||||
"tool_calls": 9,
|
||||
"errors": 0
|
||||
},
|
||||
"expectations": [
|
||||
{"text": "Plik zaczyna się od <?xml...?>", "passed": true, "evidence": "Poprawna deklaracja XML"},
|
||||
{"text": "DataForm ma Priority=\"10\"", "passed": true, "evidence": "Atrybut w elemencie root"},
|
||||
{"text": "Page ma DataContext=\"{DataSource}\"", "passed": true, "evidence": "Linia 8"},
|
||||
{"text": "Trzy grupy: Dane podstawowe, Warunki, Pozycje", "passed": true, "evidence": "Linie 10,17,30"},
|
||||
{"text": "Układ Row>Stack w grupie Warunki", "passed": true, "evidence": "Linie 18-28"},
|
||||
{"text": "Grid: IsToolbarVisible, EditInPlace, NewInPlace", "passed": true, "evidence": "Linie 31-35"},
|
||||
{"text": "Footer=Sum na Ilosc i Wartosc", "passed": true, "evidence": "Linie 38,40"},
|
||||
{"text": "Binding {NazwaPola}", "passed": true, "evidence": "EditValue we wszystkich polach"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"eval_id": 1,
|
||||
"eval_name": "pageform-z-gridem",
|
||||
"configuration": "without_skill",
|
||||
"run_number": 1,
|
||||
"result": {
|
||||
"pass_rate": 0.5,
|
||||
"passed": 4,
|
||||
"failed": 4,
|
||||
"total": 8,
|
||||
"time_seconds": 17.7,
|
||||
"tokens": 4263,
|
||||
"tool_calls": 3,
|
||||
"errors": 0
|
||||
},
|
||||
"expectations": [
|
||||
{"text": "Plik zaczyna się od <?xml...?>", "passed": true, "evidence": "Poprawna deklaracja"},
|
||||
{"text": "DataForm ma Priority=\"10\"", "passed": false, "evidence": "Używa <PageForm.Priority> zamiast atrybutu"},
|
||||
{"text": "Page ma DataContext=\"{DataSource}\"", "passed": false, "evidence": "Brak Page i DataContext"},
|
||||
{"text": "Trzy grupy: Dane podstawowe, Warunki, Pozycje", "passed": false, "evidence": "Grupy z Name= zamiast CaptionHtml="},
|
||||
{"text": "Układ Row>Stack w grupie Warunki", "passed": true, "evidence": "Row>Stack obecne"},
|
||||
{"text": "Grid: IsToolbarVisible, EditInPlace, NewInPlace", "passed": true, "evidence": "Atrybuty obecne"},
|
||||
{"text": "Footer=Sum na Ilosc i Wartosc", "passed": true, "evidence": "Footer=Sum na Column"},
|
||||
{"text": "Binding {NazwaPola}", "passed": false, "evidence": "Brak EditValue — używa Name= na Control"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"eval_id": 2,
|
||||
"eval_name": "viewform-lista",
|
||||
"configuration": "with_skill",
|
||||
"run_number": 1,
|
||||
"result": {
|
||||
"pass_rate": 1.0,
|
||||
"passed": 7,
|
||||
"failed": 0,
|
||||
"total": 7,
|
||||
"time_seconds": 38.4,
|
||||
"tokens": 34304,
|
||||
"tool_calls": 10,
|
||||
"errors": 0
|
||||
},
|
||||
"expectations": [
|
||||
{"text": "Poprawna deklaracja XML", "passed": true, "evidence": "Linia 1"},
|
||||
{"text": "xmlns='http://www.enova.pl/schema/form.xsd'", "passed": true, "evidence": "Poprawny namespace"},
|
||||
{"text": "Element Grid", "passed": true, "evidence": "Grid z ViewType=Folder"},
|
||||
{"text": "OrderBy='NazwiskoImie'", "passed": true, "evidence": "Linia 12"},
|
||||
{"text": "IsToolbarVisible i IsFilterRowVisible", "passed": true, "evidence": "Linie 10-11"},
|
||||
{"text": "6 pól: Kod, NazwiskoImie, NumerPracownika, Dzial, Stanowisko, DataZatrudnienia", "passed": true, "evidence": "Linie 13-18"},
|
||||
{"text": "Dzial ma Width='*'", "passed": true, "evidence": "Linia 16"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"eval_id": 2,
|
||||
"eval_name": "viewform-lista",
|
||||
"configuration": "without_skill",
|
||||
"run_number": 1,
|
||||
"result": {
|
||||
"pass_rate": 0.43,
|
||||
"passed": 3,
|
||||
"failed": 4,
|
||||
"total": 7,
|
||||
"time_seconds": 13.9,
|
||||
"tokens": 3821,
|
||||
"tool_calls": 3,
|
||||
"errors": 0
|
||||
},
|
||||
"expectations": [
|
||||
{"text": "Poprawna deklaracja XML", "passed": true, "evidence": "Linia 1"},
|
||||
{"text": "xmlns='http://www.enova.pl/schema/form.xsd'", "passed": false, "evidence": "Błędny namespace: http://www.soneta.pl/viewform"},
|
||||
{"text": "Element Grid", "passed": false, "evidence": "<grid> lowercase — błędna nazwa elementu"},
|
||||
{"text": "OrderBy='NazwiskoImie'", "passed": false, "evidence": "defaultsort= zamiast OrderBy="},
|
||||
{"text": "IsToolbarVisible i IsFilterRowVisible", "passed": false, "evidence": "Atrybuty lowercase — błędna składnia"},
|
||||
{"text": "6 pól: Kod, NazwiskoImie, NumerPracownika, Dzial, Stanowisko, DataZatrudnienia", "passed": true, "evidence": "<column> z właściwymi nazwami"},
|
||||
{"text": "Dzial ma Width='*'", "passed": true, "evidence": "width='*' na Dzial"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"eval_id": 3,
|
||||
"eval_name": "warunkowa-widocznosc",
|
||||
"configuration": "with_skill",
|
||||
"run_number": 1,
|
||||
"result": {
|
||||
"pass_rate": 1.0,
|
||||
"passed": 5,
|
||||
"failed": 0,
|
||||
"total": 5,
|
||||
"time_seconds": 72.7,
|
||||
"tokens": 37262,
|
||||
"tool_calls": 10,
|
||||
"errors": 0
|
||||
},
|
||||
"expectations": [
|
||||
{"text": "Visibility='{?Pole=Wartość}'", "passed": true, "evidence": "Linia 30: {?TypKlienta=Korporacyjny}"},
|
||||
{"text": "OR: {?Status=Premium or Status=VIP}", "passed": true, "evidence": "Linia 48"},
|
||||
{"text": "Negacja: {?!Status=Aktywny}", "passed": true, "evidence": "Linia 65"},
|
||||
{"text": "Class='Collapsable' na Group", "passed": true, "evidence": "Linia 109"},
|
||||
{"text": "Renderable dla licencji", "passed": true, "evidence": "Linia 110: Renderable='{Licence.HAN}'"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"eval_id": 3,
|
||||
"eval_name": "warunkowa-widocznosc",
|
||||
"configuration": "without_skill",
|
||||
"run_number": 1,
|
||||
"result": {
|
||||
"pass_rate": 0.0,
|
||||
"passed": 0,
|
||||
"failed": 5,
|
||||
"total": 5,
|
||||
"time_seconds": 50.3,
|
||||
"tokens": 5896,
|
||||
"tool_calls": 3,
|
||||
"errors": 0
|
||||
},
|
||||
"expectations": [
|
||||
{"text": "Visibility='{?Pole=Wartość}'", "passed": false, "evidence": "RowCondition= (nie istnieje) zamiast Visibility"},
|
||||
{"text": "OR: {?Status=Premium or Status=VIP}", "passed": false, "evidence": "SQL-like OR syntax zamiast {?... or ...}"},
|
||||
{"text": "Negacja: {?!Status=Aktywny}", "passed": false, "evidence": "NOT (...) zamiast {?!...}"},
|
||||
{"text": "Class='Collapsable' na Group", "passed": false, "evidence": "Collapsable='True' atrybut zamiast Class="},
|
||||
{"text": "Renderable dla licencji", "passed": false, "evidence": "Session.Features.IsAvailable() — błędna składnia"}
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
"run_summary": {
|
||||
"with_skill": {
|
||||
"pass_rate": {"mean": 1.0, "stddev": 0.0, "min": 1.0, "max": 1.0},
|
||||
"time_seconds": {"mean": 49.6, "stddev": 18.1, "min": 37.8, "max": 72.7},
|
||||
"tokens": {"mean": 34758, "stddev": 2284, "min": 32707, "max": 37262}
|
||||
},
|
||||
"without_skill": {
|
||||
"pass_rate": {"mean": 0.31, "stddev": 0.22, "min": 0.0, "max": 0.5},
|
||||
"time_seconds": {"mean": 27.3, "stddev": 19.1, "min": 13.9, "max": 50.3},
|
||||
"tokens": {"mean": 4660, "stddev": 1080, "min": 3821, "max": 5896}
|
||||
},
|
||||
"delta": {
|
||||
"pass_rate": "+0.69",
|
||||
"time_seconds": "+22.3",
|
||||
"tokens": "+30098"
|
||||
}
|
||||
},
|
||||
|
||||
"notes": [
|
||||
"Skill daje 100% pass rate we wszystkich 3 ewaluacjach vs 31% bez skilla — poprawa o 69 punktów procentowych",
|
||||
"Bez skilla model halucynuje nieistniejące elementy XML: <PageForm>, <Control>, <viewform>, <Edit>, RowCondition=",
|
||||
"Najsłabszy wynik bez skilla: warunkowa-widocznosc (0%) — model zna pojęcia, ale ma zupełnie błędną składnię",
|
||||
"Skill kosztuje więcej tokenów (35k vs 4.7k) i czasu (50s vs 27s) — uzasadnione, biorąc pod uwagę czytanie SKILL.md i ELEMENTS.md",
|
||||
"Asercja 'Poprawna deklaracja XML' zawsze przechodzi — mało dyskryminująca, ale utrzymana dla kompletności"
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user