rewrite silu and softmax for cpu by jart · Pull Request #7154 · ggerganov/llama.cpp · GitHub

Date: New issue Have a question about this project? Sign up for a free

Location: github.com

📈 llama.cpp server for bench-server-baseline on Standard_NC4as_T4_v3 for phi-2-q4_0: 543 iterations 🚀

Expand details for performance related PR only
  • Concurrent users: 8, duration: 10m
  • HTTP request : avg=8626.19ms p(95)=21696.44ms fails=, finish reason: stop=474 truncated=69
  • Prompt processing (pp): avg=94.59tk/s p(95)=412.43tk/s
  • Token generation (tg): avg=33.43tk/s p(95)=48.33tk/s
  • ggml-org/models/phi-2/ggml-model-q4_0.gguf parallel=8 ctx-size=16384 ngl=33 batch-size=2048 ubatch-size=256 pp=1024 pp+tg=2048 branch=expf commit=d7359a389c236193edac1c8761e6ac98844654f3
More
---
config:
    xyChart:
        titleFontSize: 12
        width: 900
        height: 600
    themeVariables:
        xyChart:
            titleColor: "#000000"
---
xychart-beta
    title "llama.cpp bench-server-baseline on Standard_NC4as_T4_v3 duration=10m 543 iterations"
    y-axis "llamacpp:prompt_tokens_seconds"
    x-axis "llamacpp:prompt_tokens_seconds" 1715376005 --> 1715376631
    line [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 676.15, 676.15, 676.15, 676.15, 676.15, 693.38, 693.38, 693.38, 693.38, 693.38, 686.03, 686.03, 686.03, 686.03, 686.03, 716.71, 716.71, 716.71, 716.71, 716.71, 787.67, 787.67, 787.67, 787.67, 787.67, 798.67, 798.67, 798.67, 798.67, 798.67, 798.41, 798.41, 798.41, 798.41, 798.41, 816.18, 816.18, 816.18, 816.18, 816.18, 816.66, 816.66, 816.66, 816.66, 816.66, 826.24, 826.24, 826.24, 826.24, 826.24, 827.91, 827.91, 827.91, 827.91, 827.91, 839.83, 839.83, 839.83, 839.83, 839.83, 845.37, 845.37, 845.37, 845.37, 845.37, 891.54, 891.54, 891.54, 891.54, 891.54, 896.52, 896.52, 896.52, 896.52, 896.52, 898.39, 898.39, 898.39, 898.39, 898.39, 896.16, 896.16, 896.16, 896.16, 896.16, 909.86, 909.86, 909.86, 909.86, 909.86, 901.74, 901.74, 901.74, 901.74, 901.74, 898.93, 898.93, 898.93, 898.93, 898.93, 900.17, 900.17, 900.17, 900.17, 900.17, 901.19, 901.19, 901.19, 901.19, 901.19, 901.37, 901.37, 901.37, 901.37, 901.37, 914.57, 914.57, 914.57, 914.57, 914.57, 913.27, 913.27, 913.27, 913.27, 913.27, 914.12, 914.12, 914.12, 914.12, 914.12, 884.7, 884.7, 884.7, 884.7, 884.7, 880.58, 880.58, 880.58, 880.58, 880.58, 874.62, 874.62, 874.62, 874.62, 874.62, 874.44, 874.44, 874.44, 874.44, 874.44, 878.93, 878.93, 878.93, 878.93, 878.93, 876.59, 876.59, 876.59, 876.59, 876.59, 879.89, 879.89, 879.89, 879.89, 879.89, 889.29, 889.29, 889.29, 889.29, 889.29, 896.06, 896.06, 896.06, 896.06, 896.06, 895.27, 895.27, 895.27, 895.27, 895.27, 898.07, 898.07, 898.07, 898.07, 898.07, 895.61, 895.61, 895.61, 895.61, 895.61, 898.03, 898.03, 898.03, 898.03, 898.03, 900.02, 900.02, 900.02, 900.02, 900.02, 903.55, 903.55, 903.55, 903.55, 903.55, 912.38, 912.38, 912.38, 912.38, 912.38, 913.02, 913.02, 913.02, 913.02, 913.02, 909.18, 909.18, 909.18, 909.18, 909.18, 908.34, 908.34, 908.34, 908.34, 908.34, 904.61, 904.61, 904.61, 904.61, 904.61, 904.91, 904.91, 904.91, 904.91, 904.91, 909.01, 909.01, 909.01, 909.01, 909.01, 908.42, 908.42, 908.42, 
908.42, 908.42, 913.16, 913.16, 913.16, 913.16, 913.16, 912.15, 912.15, 912.15, 912.15, 912.15, 914.4, 914.4, 914.4, 914.4, 914.4, 917.57, 917.57, 917.57, 917.57, 917.57, 915.58, 915.58, 915.58, 915.58, 915.58, 920.75, 920.75, 920.75, 920.75, 920.75, 919.24, 919.24, 919.24, 919.24, 919.24, 920.07, 920.07, 920.07, 920.07, 920.07, 918.79, 918.79, 918.79, 918.79, 918.79, 917.24, 917.24, 917.24, 917.24, 917.24, 918.44, 918.44, 918.44, 918.44, 918.44, 918.61, 918.61, 918.61, 918.61]
                    
More
---
config:
    xyChart:
        titleFontSize: 12
        width: 900
        height: 600
    themeVariables:
        xyChart:
            titleColor: "#000000"
---
xychart-beta
    title "llama.cpp bench-server-baseline on Standard_NC4as_T4_v3 duration=10m 543 iterations"
    y-axis "llamacpp:predicted_tokens_seconds"
    x-axis "llamacpp:predicted_tokens_seconds" 1715376005 --> 1715376631
    line [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 41.33, 41.33, 41.33, 41.33, 41.33, 35.68, 35.68, 35.68, 35.68, 35.68, 29.47, 29.47, 29.47, 29.47, 29.47, 28.84, 28.84, 28.84, 28.84, 28.84, 30.64, 30.64, 30.64, 30.64, 30.64, 31.13, 31.13, 31.13, 31.13, 31.13, 32.39, 32.39, 32.39, 32.39, 32.39, 33.65, 33.65, 33.65, 33.65, 33.65, 33.61, 33.61, 33.61, 33.61, 33.61, 33.73, 33.73, 33.73, 33.73, 33.73, 33.4, 33.4, 33.4, 33.4, 33.4, 33.78, 33.78, 33.78, 33.78, 33.78, 33.62, 33.62, 33.62, 33.62, 33.62, 32.91, 32.91, 32.91, 32.91, 32.91, 32.27, 32.27, 32.27, 32.27, 32.27, 32.39, 32.39, 32.39, 32.39, 32.39, 32.5, 32.5, 32.5, 32.5, 32.5, 32.5, 32.5, 32.5, 32.5, 32.5, 32.07, 32.07, 32.07, 32.07, 32.07, 31.93, 31.93, 31.93, 31.93, 31.93, 31.67, 31.67, 31.67, 31.67, 31.67, 31.58, 31.58, 31.58, 31.58, 31.58, 31.79, 31.79, 31.79, 31.79, 31.79, 31.57, 31.57, 31.57, 31.57, 31.57, 31.78, 31.78, 31.78, 31.78, 31.78, 32.01, 32.01, 32.01, 32.01, 32.01, 32.02, 32.02, 32.02, 32.02, 32.02, 31.52, 31.52, 31.52, 31.52, 31.52, 31.35, 31.35, 31.35, 31.35, 31.35, 31.45, 31.45, 31.45, 31.45, 31.45, 31.65, 31.65, 31.65, 31.65, 31.65, 31.8, 31.8, 31.8, 31.8, 31.8, 32.01, 32.01, 32.01, 32.01, 32.01, 32.12, 32.12, 32.12, 32.12, 32.12, 32.05, 32.05, 32.05, 32.05, 32.05, 31.82, 31.82, 31.82, 31.82, 31.82, 31.67, 31.67, 31.67, 31.67, 31.67, 31.73, 31.73, 31.73, 31.73, 31.73, 31.87, 31.87, 31.87, 31.87, 31.87, 31.99, 31.99, 31.99, 31.99, 31.99, 32.1, 32.1, 32.1, 32.1, 32.1, 32.02, 32.02, 32.02, 32.02, 32.02, 31.97, 31.97, 31.97, 31.97, 31.97, 31.31, 31.31, 31.31, 31.31, 31.31, 30.76, 30.76, 30.76, 30.76, 30.76, 30.0, 30.0, 30.0, 30.0, 30.0, 29.71, 29.71, 29.71, 29.71, 29.71, 29.65, 29.65, 29.65, 29.65, 29.65, 29.82, 29.82, 29.82, 29.82, 29.82, 29.85, 29.85, 29.85, 29.85, 29.85, 29.95, 29.95, 29.95, 29.95, 29.95, 29.98, 29.98, 29.98, 29.98, 29.98, 30.01, 30.01, 30.01, 30.01, 30.01, 29.85, 29.85, 29.85, 29.85, 29.85, 29.78, 29.78, 29.78, 29.78, 29.78, 29.74, 29.74, 29.74, 29.74, 29.74, 29.88, 
29.88, 29.88, 29.88, 29.88, 30.01, 30.01, 30.01, 30.01, 30.01, 30.1, 30.1, 30.1, 30.1, 30.1, 30.18, 30.18, 30.18, 30.18, 30.18, 30.28, 30.28, 30.28, 30.28]
                    
Details
More
---
config:
    xyChart:
        titleFontSize: 12
        width: 900
        height: 600
    themeVariables:
        xyChart:
            titleColor: "#000000"
---
xychart-beta
    title "llama.cpp bench-server-baseline on Standard_NC4as_T4_v3 duration=10m 543 iterations"
    y-axis "llamacpp:kv_cache_usage_ratio"
    x-axis "llamacpp:kv_cache_usage_ratio" 1715376005 --> 1715376631
    line [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.24, 0.24, 0.24, 0.24, 0.24, 0.38, 0.38, 0.38, 0.38, 0.38, 0.23, 0.23, 0.23, 0.23, 0.23, 0.12, 0.12, 0.12, 0.12, 0.12, 0.21, 0.21, 0.21, 0.21, 0.21, 0.11, 0.11, 0.11, 0.11, 0.11, 0.13, 0.13, 0.13, 0.13, 0.13, 0.15, 0.15, 0.15, 0.15, 0.15, 0.18, 0.18, 0.18, 0.18, 0.18, 0.22, 0.22, 0.22, 0.22, 0.22, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.26, 0.26, 0.26, 0.26, 0.26, 0.32, 0.32, 0.32, 0.32, 0.32, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.18, 0.18, 0.18, 0.18, 0.18, 0.3, 0.3, 0.3, 0.3, 0.3, 0.28, 0.28, 0.28, 0.28, 0.28, 0.32, 0.32, 0.32, 0.32, 0.32, 0.21, 0.21, 0.21, 0.21, 0.21, 0.17, 0.17, 0.17, 0.17, 0.17, 0.15, 0.15, 0.15, 0.15, 0.15, 0.14, 0.14, 0.14, 0.14, 0.14, 0.12, 0.12, 0.12, 0.12, 0.12, 0.2, 0.2, 0.2, 0.2, 0.2, 0.31, 0.31, 0.31, 0.31, 0.31, 0.23, 0.23, 0.23, 0.23, 0.23, 0.16, 0.16, 0.16, 0.16, 0.16, 0.15, 0.15, 0.15, 0.15, 0.15, 0.11, 0.11, 0.11, 0.11, 0.11, 0.13, 0.13, 0.13, 0.13, 0.13, 0.17, 0.17, 0.17, 0.17, 0.17, 0.23, 0.23, 0.23, 0.23, 0.23, 0.21, 0.21, 0.21, 0.21, 0.21, 0.19, 0.19, 0.19, 0.19, 0.19, 0.16, 0.16, 0.16, 0.16, 0.16, 0.15, 0.15, 0.15, 0.15, 0.15, 0.14, 0.14, 0.14, 0.14, 0.14, 0.09, 0.09, 0.09, 0.09, 0.09, 0.25, 0.25, 0.25, 0.25, 0.25, 0.44, 0.44, 0.44, 0.44, 0.44, 0.54, 0.54, 0.54, 0.54, 0.54, 0.62, 0.62, 0.62, 0.62, 0.62, 0.6, 0.6, 0.6, 0.6, 0.6, 0.29, 0.29, 0.29, 0.29, 0.29, 0.14, 0.14, 0.14, 0.14, 0.14, 0.15, 0.15, 0.15, 0.15, 0.15, 0.12, 0.12, 0.12, 0.12, 0.12, 0.17, 0.17, 0.17, 0.17, 0.17, 0.11, 0.11, 0.11, 0.11, 0.11, 0.17, 0.17, 0.17, 0.17, 0.17, 0.31, 0.31, 0.31, 0.31, 0.31, 0.23, 0.23, 0.23, 0.23, 0.23, 0.25, 0.25, 0.25, 0.25, 0.25, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.14, 0.14, 0.14, 0.14, 0.14, 0.11, 0.11, 0.11, 0.11, 0.11, 0.17, 0.17, 0.17, 0.17]
                    
More
---
config:
    xyChart:
        titleFontSize: 12
        width: 900
        height: 600
    themeVariables:
        xyChart:
            titleColor: "#000000"
---
xychart-beta
    title "llama.cpp bench-server-baseline on Standard_NC4as_T4_v3 duration=10m 543 iterations"
    y-axis "llamacpp:requests_processing"
    x-axis "llamacpp:requests_processing" 1715376005 --> 1715376631
    line [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6.0, 6.0, 6.0, 6.0, 6.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 8.0, 8.0, 8.0, 8.0, 8.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 7.0, 7.0, 7.0, 7.0, 7.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 8.0, 8.0, 8.0, 8.0, 8.0, 7.0, 7.0, 7.0, 7.0, 7.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 7.0, 7.0, 7.0, 7.0, 7.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 3.0, 3.0, 6.0, 6.0, 6.0, 6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 3.0, 8.0, 8.0, 8.0, 8.0, 8.0, 2.0, 2.0, 2.0, 2.0, 2.0, 6.0, 6.0, 6.0, 6.0, 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 2.0, 2.0, 2.0, 2.0, 2.0, 7.0, 7.0, 7.0, 7.0, 7.0, 6.0, 6.0, 6.0, 6.0, 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 8.0, 8.0, 8.0, 8.0, 8.0, 5.0, 5.0, 5.0, 5.0, 5.0, 8.0, 8.0, 8.0, 8.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 6.0, 6.0, 6.0, 6.0, 6.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 8.0, 8.0, 8.0, 8.0, 8.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 8.0, 8.0, 8.0, 8.0, 8.0, 6.0, 6.0, 6.0, 6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 3.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 2.0, 2.0, 2.0, 2.0, 2.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 3.0]