🎯 Qwen2.5-7B-Instruct

🎯 Qwen2.5-7B-Instruct

vLLM Version: vLLM: 0.9.1 (b6553be), vLLM Ascend: main (ca884ef)
vLLM Engine: V0
Software Environment: CANN: 8.1.RC1, PyTorch: 2.5.1, torch-npu: 2.5.1.post1.dev20250619
Hardware Environment: Atlas A2 Series
Datasets: ceval-valid,gsm8k
Command:

export MODEL_ARGS='pretrained=Qwen/Qwen2.5-7B-Instruct,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'
lm_eval --model vllm --model_args $MODEL_ARGS --tasks ceval-valid,gsm8k \
--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1

| Task | Filter | n-shot | Metric | Value | Stderr |
|------|--------|-------:|--------|------:|-------:|
| ceval-valid | none | 5 | acc_norm | ✅0.8001 | ± 0.0105 |
| gsm8k | flexible-extract | 5 | exact_match | ✅0.7278 | ± 0.0123 |

ceval-valid details

| Task | Filter | n-shot | Metric | Value | Stderr |
|------|--------|-------:|--------|------:|-------:|
| ceval-valid | none | 5 | acc_norm | ✅0.8001 | ± 0.0105 |
| - ceval-valid_accountant | none | 5 | acc | 0.8776 | ± 0.0473 |
| - ceval-valid_advanced_mathematics | none | 5 | acc | 0.4211 | ± 0.1164 |
| - ceval-valid_art_studies | none | 5 | acc | 0.7273 | ± 0.0787 |
| - ceval-valid_basic_medicine | none | 5 | acc | 0.9474 | ± 0.0526 |
| - ceval-valid_business_administration | none | 5 | acc | 0.8485 | ± 0.0634 |
| - ceval-valid_chinese_language_and_literature | none | 5 | acc | 0.6087 | ± 0.1041 |
| - ceval-valid_civil_servant | none | 5 | acc | 0.8298 | ± 0.0554 |
| - ceval-valid_clinical_medicine | none | 5 | acc | 0.7727 | ± 0.0914 |
| - ceval-valid_college_chemistry | none | 5 | acc | 0.6250 | ± 0.1009 |
| - ceval-valid_college_economics | none | 5 | acc | 0.7455 | ± 0.0593 |
| - ceval-valid_college_physics | none | 5 | acc | 0.7368 | ± 0.1038 |
| - ceval-valid_college_programming | none | 5 | acc | 0.8649 | ± 0.0570 |
| - ceval-valid_computer_architecture | none | 5 | acc | 0.7143 | ± 0.1010 |
| - ceval-valid_computer_network | none | 5 | acc | 0.6842 | ± 0.1096 |
| - ceval-valid_discrete_mathematics | none | 5 | acc | 0.2500 | ± 0.1118 |
| - ceval-valid_education_science | none | 5 | acc | 0.8621 | ± 0.0652 |
| - ceval-valid_electrical_engineer | none | 5 | acc | 0.6757 | ± 0.0780 |
| - ceval-valid_environmental_impact_assessment_engineer | none | 5 | acc | 0.7419 | ± 0.0799 |
| - ceval-valid_fire_engineer | none | 5 | acc | 0.7419 | ± 0.0799 |
| - ceval-valid_high_school_biology | none | 5 | acc | 0.8947 | ± 0.0723 |
| - ceval-valid_high_school_chemistry | none | 5 | acc | 0.7368 | ± 0.1038 |
| - ceval-valid_high_school_chinese | none | 5 | acc | 0.6842 | ± 0.1096 |
| - ceval-valid_high_school_geography | none | 5 | acc | 0.8947 | ± 0.0723 |
| - ceval-valid_high_school_history | none | 5 | acc | 0.9000 | ± 0.0688 |
| - ceval-valid_high_school_mathematics | none | 5 | acc | 0.5000 | ± 0.1213 |
| - ceval-valid_high_school_physics | none | 5 | acc | 0.7368 | ± 0.1038 |
| - ceval-valid_high_school_politics | none | 5 | acc | 0.8947 | ± 0.0723 |
| - ceval-valid_ideological_and_moral_cultivation | none | 5 | acc | 0.9474 | ± 0.0526 |
| - ceval-valid_law | none | 5 | acc | 0.6667 | ± 0.0983 |
| - ceval-valid_legal_professional | none | 5 | acc | 0.7391 | ± 0.0936 |
| - ceval-valid_logic | none | 5 | acc | 0.6364 | ± 0.1050 |
| - ceval-valid_mao_zedong_thought | none | 5 | acc | 0.9583 | ± 0.0417 |
| - ceval-valid_marxism | none | 5 | acc | 0.9474 | ± 0.0526 |
| - ceval-valid_metrology_engineer | none | 5 | acc | 0.8333 | ± 0.0777 |
| - ceval-valid_middle_school_biology | none | 5 | acc | 0.9524 | ± 0.0476 |
| - ceval-valid_middle_school_chemistry | none | 5 | acc | 0.9500 | ± 0.0500 |
| - ceval-valid_middle_school_geography | none | 5 | acc | 0.9167 | ± 0.0833 |
| - ceval-valid_middle_school_history | none | 5 | acc | 0.9091 | ± 0.0627 |
| - ceval-valid_middle_school_mathematics | none | 5 | acc | 0.6842 | ± 0.1096 |
| - ceval-valid_middle_school_physics | none | 5 | acc | 0.9474 | ± 0.0526 |
| - ceval-valid_middle_school_politics | none | 5 | acc | 1.0000 | ± 0.0000 |
| - ceval-valid_modern_chinese_history | none | 5 | acc | 0.9130 | ± 0.0601 |
| - ceval-valid_operating_system | none | 5 | acc | 0.8421 | ± 0.0859 |
| - ceval-valid_physician | none | 5 | acc | 0.8367 | ± 0.0533 |
| - ceval-valid_plant_protection | none | 5 | acc | 0.8636 | ± 0.0749 |
| - ceval-valid_probability_and_statistics | none | 5 | acc | 0.5556 | ± 0.1205 |
| - ceval-valid_professional_tour_guide | none | 5 | acc | 0.8966 | ± 0.0576 |
| - ceval-valid_sports_science | none | 5 | acc | 0.9474 | ± 0.0526 |
| - ceval-valid_tax_accountant | none | 5 | acc | 0.8571 | ± 0.0505 |
| - ceval-valid_teacher_qualification | none | 5 | acc | 0.9091 | ± 0.0438 |
| - ceval-valid_urban_and_rural_planner | none | 5 | acc | 0.8043 | ± 0.0591 |
| - ceval-valid_veterinary_medicine | none | 5 | acc | 0.8261 | ± 0.0808 |

gsm8k details

| Task | Filter | n-shot | Metric | Value | Stderr |
|------|--------|-------:|--------|------:|-------:|
| gsm8k | flexible-extract | 5 | exact_match | ✅0.7278 | ± 0.0123 |