Unwrap vm_call_cfunc indirection on JIT

for VM_METHOD_TYPE_CFUNC. This has been known to decrease optcarrot fps: ``` $ benchmark-driver -v --rbenv 'before --jit;after --jit' benchmark.yml --repeat-count=24 --output=all before --jit: ruby 2.8.0dev (2020-04-13T16:25:13Z master fb40495c) +JIT [x86_64-linux] after --jit: ruby 2.8.0dev (2020-04-13T23:23:11Z mjit-inline-c bdcd06d159) +JIT [x86_64-linux] Calculating ------------------------------------- before --jit after --jit Optcarrot Lan_Master.nes 66.38132676191719 67.41369177299630 fps 69.42728743772243 68.90327567263054 72.16028300263211 69.62605130880686 72.46631319102777 70.48818243767207 73.37078877002490 70.79522887347566 73.69422431217367 70.99021920193194 74.01471487018695 74.69931965402584 75.48685183295630 74.86714575949016 75.54445264507932 75.97864419721677 77.28089738169756 76.48908637569581 78.04183397891302 76.54320932488021 78.36807984096562 76.59407262898067 78.92898762543574 77.31316743361343 78.93576483233765 77.97153484180480 79.13754917503078 77.98478782102325 79.62648945850653 78.02263322726446 79.86334213878064 78.26333724045934 80.05100635898518 78.60056756355614 80.26186843769584 78.91082645644468 80.34205717020330 79.01226659142263 80.62286066044338 79.32733939423721 80.95883033058557 79.63793060542024 80.97376819251613 79.73108936622778 81.23050939202896 80.18280109433088 ``` and I deleted this capability in an early stage of YARV-MJIT development: https://github.com/k0kubun/yarv-mjit/commit/0ab130feeefc2b9078a1077e4fec93b3f5e45d07 I suspect either of the following things could be the cause: * Directly calling vm_call_cfunc requires more optimization effort in GCC, resulting in a 30ms-ish compilation time increase for such methods and decreasing the number of methods compiled in the benchmarked period. * Code size increase => more icache misses. These hypotheses could be verified by some methodologies. However, I'd like to introduce this regardless of the result because the existing indirection blocks inlining of C method definitions. 
I may revert this commit if I give up on implementing inlining of C method definitions, which requires this change. Microbenchmark-wise, this gives a slight performance improvement: ``` $ benchmark-driver -v --rbenv 'before --jit;after --jit' benchmark/mjit_send_cfunc.yml --repeat-count=4 before --jit: ruby 2.8.0dev (2020-04-13T16:25:13Z master fb40495c) +JIT [x86_64-linux] after --jit: ruby 2.8.0dev (2020-04-13T23:23:11Z mjit-inline-c bdcd06d159) +JIT [x86_64-linux] Calculating ------------------------------------- before --jit after --jit mjit_send_cfunc 41.961M 56.489M i/s - 100.000M times in 2.383143s 1.770244s Comparison: mjit_send_cfunc after --jit: 56489372.5 i/s before --jit: 41961388.1 i/s - 1.35x slower ```

Unwrap vm_call_cfunc indirection on JIT
for VM_METHOD_TYPE_CFUNC. This has been known to decrease optcarrot fps: ``` $ benchmark-driver -v --rbenv 'before --jit;after --jit' benchmark.yml --repeat-count=24 --output=all before --jit: ruby 2.8.0dev (2020-04-13T16:25:13Z master fb40495c) +JIT [x86_64-linux] after --jit: ruby 2.8.0dev (2020-04-13T23:23:11Z mjit-inline-c bdcd06d159) +JIT [x86_64-linux] Calculating ------------------------------------- before --jit after --jit Optcarrot Lan_Master.nes 66.38132676191719 67.41369177299630 fps 69.42728743772243 68.90327567263054 72.16028300263211 69.62605130880686 72.46631319102777 70.48818243767207 73.37078877002490 70.79522887347566 73.69422431217367 70.99021920193194 74.01471487018695 74.69931965402584 75.48685183295630 74.86714575949016 75.54445264507932 75.97864419721677 77.28089738169756 76.48908637569581 78.04183397891302 76.54320932488021 78.36807984096562 76.59407262898067 78.92898762543574 77.31316743361343 78.93576483233765 77.97153484180480 79.13754917503078 77.98478782102325 79.62648945850653 78.02263322726446 79.86334213878064 78.26333724045934 80.05100635898518 78.60056756355614 80.26186843769584 78.91082645644468 80.34205717020330 79.01226659142263 80.62286066044338 79.32733939423721 80.95883033058557 79.63793060542024 80.97376819251613 79.73108936622778 81.23050939202896 80.18280109433088 ``` and I deleted this capability in an early stage of YARV-MJIT development: https://github.com/k0kubun/yarv-mjit/commit/0ab130feeefc2b9078a1077e4fec93b3f5e45d07 I suspect either of the following things could be the cause: * Directly calling vm_call_cfunc requires more optimization effort in GCC, resulting in a 30ms-ish compilation time increase for such methods and decreasing the number of methods compiled in the benchmarked period. * Code size increase => more icache misses. These hypotheses could be verified by some methodologies. However, I'd like to introduce this regardless of the result because the existing indirection blocks inlining of C method definitions. 
I may revert this commit if I give up on implementing inlining of C method definitions, which requires this change. Microbenchmark-wise, this gives a slight performance improvement: ``` $ benchmark-driver -v --rbenv 'before --jit;after --jit' benchmark/mjit_send_cfunc.yml --repeat-count=4 before --jit: ruby 2.8.0dev (2020-04-13T16:25:13Z master fb40495c) +JIT [x86_64-linux] after --jit: ruby 2.8.0dev (2020-04-13T23:23:11Z mjit-inline-c bdcd06d159) +JIT [x86_64-linux] Calculating ------------------------------------- before --jit after --jit mjit_send_cfunc 41.961M 56.489M i/s - 100.000M times in 2.383143s 1.770244s Comparison: mjit_send_cfunc after --jit: 56489372.5 i/s before --jit: 41961388.1 i/s - 1.35x slower ```
b9d3ceee · Takashi Kokubun · fb40495c · b9d3ceee · b9d3ceee
隐藏空白更改
内联并排

Showing 2 changed files with 38 additions and 21 deletions

benchmark/mjit_send_cfunc.yml benchmark/mjit_send_cfunc.yml +6 -0

tool/ruby_vm/views/_mjit_compile_send.erb tool/ruby_vm/views/_mjit_compile_send.erb +32 -21

未找到文件。
--- a/benchmark/mjit_send_cfunc.yml
+++ b/benchmark/mjit_send_cfunc.yml
+prelude: |
+  def mjit_send_cfunc
+    self.class
+  end
+benchmark: mjit_send_cfunc
+loop_count: 100000000
--- a/tool/ruby_vm/views/_mjit_compile_send.erb
+++ b/tool/ruby_vm/views/_mjit_compile_send.erb
@@ -15,15 +15,18 @@
 % # compiler: Use captured cc to avoid race condition
    const struct rb_callcache *captured_cc = captured_cc_entries(status)[call_data_index(cd, body)];
 %
-    const rb_iseq_t *iseq;
+% # compiler: Inline send insn where some supported fastpath is used.
+    const rb_iseq_t *iseq = NULL;
    const CALL_INFO ci = cd->ci;
-    if (!status->compile_info->disable_send_cache && has_valid_method_type(captured_cc)
-        // CC_SET_FASTPATH in vm_callee_setup_arg
-        && !(vm_ci_flag(ci) & VM_CALL_TAILCALL) // inlining non-tailcall path
-        && vm_cc_cme(captured_cc)->def->type == VM_METHOD_TYPE_ISEQ
-        && fastpath_applied_iseq_p(ci, captured_cc, iseq = def_iseq_ptr(vm_cc_cme(captured_cc)->def))) {
-
-        int param_size = iseq->body->param.size;
+    if (!status->compile_info->disable_send_cache && has_valid_method_type(captured_cc) && (
+%       # `CC_SET_FASTPATH(cc, vm_call_cfunc, TRUE)` in `vm_call_method_each_type`
+        vm_cc_cme(captured_cc)->def->type == VM_METHOD_TYPE_CFUNC
+%       # `CC_SET_FASTPATH(cc, vm_call_iseq_setup_func(...), vm_call_iseq_optimizable_p(...))` in `vm_callee_setup_arg`,
+%       # and support only non-VM_CALL_TAILCALL path inside it
+        || (vm_cc_cme(captured_cc)->def->type == VM_METHOD_TYPE_ISEQ
+            && fastpath_applied_iseq_p(ci, captured_cc, iseq = def_iseq_ptr(vm_cc_cme(captured_cc)->def))
+            && !(vm_ci_flag(ci) & VM_CALL_TAILCALL))
+    )) {
        int sp_inc = (int)sp_inc_of_sendish(ci);
        fprintf(f, "{\n");

@@ -40,7 +43,7 @@
 <%= render 'mjit_compile_pc_and_sp', locals: { insn: insn } -%>

 % # JIT: If ISeq is inlinable, call the inlined method without pushing a frame.
-        if (status->inlined_iseqs != NULL && status->inlined_iseqs[pos] == iseq->body) {
+        if (iseq && status->inlined_iseqs != NULL && iseq->body == status->inlined_iseqs[pos]) {
            fprintf(f, "    {\n");
            fprintf(f, "        VALUE orig_self = reg_cfp->self;\n");
            fprintf(f, "        reg_cfp->self = stack[%d];\n", b->stack_size + sp_inc - 1);
@@ -49,7 +52,7 @@
            fprintf(f, "    }\n");
        }
        else {
-% # JIT: Forked `vm_sendish` to inline various things
+% # JIT: Forked `vm_sendish` (except method_explorer = vm_search_method_wrap) to inline various things
            fprintf(f, "    {\n");
            fprintf(f, "        VALUE val;\n");
            fprintf(f, "        struct rb_calling_info calling;\n");
@@ -58,20 +61,28 @@
 % else
            fprintf(f, "        calling.block_handler = VM_BLOCK_HANDLER_NONE;\n");
 % end
-            fprintf(f, "        calling.argc = %d;\n", vm_ci_argc(ci));
+            fprintf(f, "        calling.kw_splat = %d;\n", IS_ARGS_KW_SPLAT(ci) > 0);
            fprintf(f, "        calling.recv = stack[%d];\n", b->stack_size + sp_inc - 1);
+            fprintf(f, "        calling.argc = %d;\n", vm_ci_argc(ci));

-%           # fastpath_applied_iseq_p checks rb_simple_iseq_p, which ensures has_opt == FALSE
-            fprintf(f, "        vm_call_iseq_setup_normal(ec, reg_cfp, &calling, cc_cme, 0, %d, %d);\n", param_size, iseq->body->local_table_size);
-            if (iseq->body->catch_except_p) {
-                fprintf(f, "        VM_ENV_FLAGS_SET(ec->cfp->ep, VM_FRAME_FLAG_FINISH);\n");
-                fprintf(f, "        val = vm_exec(ec, TRUE);\n");
+            if (vm_cc_cme(captured_cc)->def->type == VM_METHOD_TYPE_CFUNC) {
+%               # TODO: optimize this more
+                fprintf(f, "        CALL_DATA cd = (CALL_DATA)0x%"PRIxVALUE";\n", operands[0]);
+                fprintf(f, "        val = vm_call_cfunc(ec, reg_cfp, &calling, cd);\n");
            }
-            else {
-                fprintf(f, "        if ((val = mjit_exec(ec)) == Qundef) {\n");
-                fprintf(f, "            VM_ENV_FLAGS_SET(ec->cfp->ep, VM_FRAME_FLAG_FINISH);\n"); // This is vm_call0_body's code after vm_call_iseq_setup
-                fprintf(f, "            val = vm_exec(ec, FALSE);\n");
-                fprintf(f, "        }\n");
+            else { // VM_METHOD_TYPE_ISEQ
+%               # fastpath_applied_iseq_p checks rb_simple_iseq_p, which ensures has_opt == FALSE
+                fprintf(f, "        vm_call_iseq_setup_normal(ec, reg_cfp, &calling, cc_cme, 0, %d, %d);\n", iseq->body->param.size, iseq->body->local_table_size);
+                if (iseq->body->catch_except_p) {
+                    fprintf(f, "        VM_ENV_FLAGS_SET(ec->cfp->ep, VM_FRAME_FLAG_FINISH);\n");
+                    fprintf(f, "        val = vm_exec(ec, TRUE);\n");
+                }
+                else {
+                    fprintf(f, "        if ((val = mjit_exec(ec)) == Qundef) {\n");
+                    fprintf(f, "            VM_ENV_FLAGS_SET(ec->cfp->ep, VM_FRAME_FLAG_FINISH);\n"); // This is vm_call0_body's code after vm_call_iseq_setup
+                    fprintf(f, "            val = vm_exec(ec, FALSE);\n");
+                    fprintf(f, "        }\n");
+                }
            }
            fprintf(f, "        stack[%d] = val;\n", b->stack_size + sp_inc - 1);
            fprintf(f, "    }\n");