Kerem G\u00fclen\/Midjourney<\/a><\/strong><\/p>\n","protected":false},"excerpt":{"rendered":"As large language models (LLMs) become increasingly sophisticated, ensuring fair and unbiased evaluation has become a critical challenge. Existing evaluation protocols often suffer from benchmark contamination, where models are trained on datasets that include portions of the test benchmarks, leading to artificially inflated results. A recent approach known as Agents-as-an-Evaluator attempts to address this issue […]<\/p>\n","protected":false},"author":584,"featured_media":64997,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"jnews-multi-image_gallery":[],"jnews_single_post":{"format":"standard","override":[{"template":"5","layout":"right-sidebar","sidebar":"default-sidebar","second_sidebar":"default-sidebar","share_position":"float","share_float_style":"share-normal","show_share_counter":"1","show_view_counter":"1","show_featured":"1","show_post_meta":"1","show_post_author":"1","show_post_author_image":"1","show_post_date":"1","post_date_format":"default","post_date_format_custom":"Y\/m\/d","show_post_category":"1","show_post_reading_time":"0","post_reading_time_wpm":"300","post_calculate_word_method":"str_word_count","zoom_button_out_step":"2","zoom_button_in_step":"3","show_post_tag":"1","number_popup_post":"1","show_author_box":"0","show_post_related":"1","show_inline_post_related":"0"}],"image_override":[{"single_post_thumbnail_size":"no-crop","single_post_gallery_size":"crop-715"}],"trending_post_position":"meta","trending_post_label":"Trending","sponsored_post_label":"Sponsored by","disable_ad":"0","subtitle":"Researchers from Hikvision Research Institute propose a new evaluation framework called the Unbiased Evaluator to mitigate LLM performance score biases"},"jnews_primary_category":[],"jnews_social_meta":[],"jnews_override_counter":{"view_counter_number":"0","share_counter_number":"0","like_counter_number":"0","dislike_counter_number":"0"},"footnotes":""},"categories":[17672],"tags":[963,2610,10645,15740],"coauthors":[9895],"class_list":["post-64996","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-research","tag-ai","tag-featured","tag-large-language-models","tag-llm"],"jnews_single_post":{"format":"standard","override":[{"template":"5","layout":"right-sidebar","sidebar":"default-sidebar","second_sidebar":"default-sidebar","share_position":"float","share_float_style":"share-normal","show_share_counter":"1","show_view_counter":"1","show_featured":"1","show_post_meta":"1","show_post_author":"1","show_post_author_image":"1","show_post_date":"1","post_date_format":"default","post_date_format_custom":"Y\/m\/d","show_post_category":"1","show_post_reading_time":"0","post_reading_time_wpm":"300","post_calculate_word_method":"str_word_count","zoom_button_out_step":"2","zoom_button_in_step":"3","show_post_tag":"1","number_popup_post":"1","show_author_box":"0","show_post_related":"1","show_inline_post_related":"0"}],"image_override":[{"single_post_thumbnail_size":"no-crop","single_post_gallery_size":"crop-715"}],"trending_post_position":"meta","trending_post_label":"Trending","sponsored_post_label":"Sponsored by","disable_ad":"0","subtitle":"Researchers from Hikvision Research Institute propose a new evaluation framework called the Unbiased Evaluator to mitigate LLM performance score biases"},"rank_math_description":null,"_links":{"self":[{"href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/posts\/64996","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/users\/584"}],"replies":[{"embeddable":true,"href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/comments?post=64996"}],"version-history":[{"count":"1","href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/posts\/64996\/revisions"}],"predecessor-version":[{"id":64998,"href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/posts\/64996\/revisions\/64998"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/media\/64997"}],"wp:attachment":[{"href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/media?parent=64996"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/categories?post=64996"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/tags?post=64996"},{"taxonomy":"author","embeddable":true,"href":"https:\/\/dataconomy.ru\/wp-json\/wp\/v2\/coauthors?post=64996"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}