@@ -94,27 +94,30 @@ class MTBenchEvaluator(AbstractMTBenchEvaluator):
9494
9595 name = "mt_bench"
9696
def gen_answers(self, server_url, api_key: str | None = None) -> None:
    """
    Asks questions to model

    Attributes
        server_url  Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
        api_key     API token for authenticating with model server (optional, defaults to None)
    """
    # locals() holds api_key at this point; mask it so the credential is
    # never written to the debug log, while keeping the rest of the logged
    # mapping (self, server_url) exactly as before.
    debug_args = {
        **locals(),
        "api_key": None if api_key is None else "<redacted>",
    }
    logger.debug(debug_args)
    # The real token is still forwarded unchanged to the answer generator.
    mt_bench_answers.generate_answers(
        self.model_name,
        server_url,
        api_key=api_key,
        output_dir=self.output_dir,
        max_workers=self.max_workers,
    )
111113
112- def judge_answers (self , server_url ) -> tuple :
114+ def judge_answers (self , server_url , api_key : str | None = None ) -> tuple :
113115 """
114116 Runs MT-Bench judgment
115117
116118 Attributes
117119 server_url Model server endpoint (Ex: http://localhost:8000/v1) for the judge model
120+ api_key API token for authenticating with model server
118121
119122 Returns:
120123 overall_score MT-Bench score for the overall model evaluation
@@ -126,6 +129,7 @@ def judge_answers(self, server_url) -> tuple:
126129 self .model_name ,
127130 self .judge_model_name ,
128131 server_url ,
132+ api_key = api_key ,
129133 max_workers = self .max_workers ,
130134 output_dir = self .output_dir ,
131135 merge_system_user_message = self .merge_system_user_message ,
@@ -171,12 +175,13 @@ def __init__(
171175 self .taxonomy_git_repo_path = taxonomy_git_repo_path
172176 self .branch = branch
173177
174- def gen_answers (self , server_url ) -> None :
178+ def gen_answers (self , server_url , api_key : str | None = None ) -> None :
175179 """
176180 Asks questions to model
177181
178182 Attributes
179183 server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
184+ api_key API token for authenticating with model server
180185 """
181186 logger .debug (locals ())
182187 mt_bench_branch_generator .generate (
@@ -188,19 +193,21 @@ def gen_answers(self, server_url) -> None:
188193 mt_bench_answers .generate_answers (
189194 self .model_name ,
190195 server_url ,
196+ api_key = api_key ,
191197 branch = self .branch ,
192198 output_dir = self .output_dir ,
193199 data_dir = self .output_dir ,
194200 max_workers = self .max_workers ,
195201 bench_name = "mt_bench_branch" ,
196202 )
197203
198- def judge_answers (self , server_url ) -> tuple :
204+ def judge_answers (self , server_url , api_key : str | None = None ) -> tuple :
199205 """
200206 Runs MT-Bench-Branch judgment. Judgments can be compared across runs with consistent question_id -> qna file name.
201207
202208 Attributes
203209 server_url Model server endpoint (Ex: http://localhost:8000/v1) for the judge model
210+ api_key API token for authenticating with model server
204211
205212 Returns:
206213 qa_pairs Question and answer pairs (with scores) from the evaluation
@@ -210,6 +217,7 @@ def judge_answers(self, server_url) -> tuple:
210217 self .model_name ,
211218 self .judge_model_name ,
212219 server_url ,
220+ api_key = api_key ,
213221 branch = self .branch ,
214222 max_workers = self .max_workers ,
215223 output_dir = self .output_dir ,
0 commit comments