Add more llama config
Browse files
    	
        inference-cache-config/llama.json
    CHANGED
    
    | @@ -59,13 +59,25 @@ | |
| 59 | 
             
                  "sequence_length": 4096,
         | 
| 60 | 
             
                  "num_cores": 24,
         | 
| 61 | 
             
                  "auto_cast_type": "fp16"
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 62 | 
             
                }
         | 
| 63 | 
             
              ],
         | 
| 64 | 
             
              "meta-llama/Llama-2-13b-chat-hf": [
         | 
| 65 | 
             
                {
         | 
| 66 | 
             
                  "batch_size": 1,
         | 
| 67 | 
             
                  "sequence_length": 4096,
         | 
| 68 | 
            -
                  "num_cores":  | 
| 69 | 
             
                  "auto_cast_type": "fp16"
         | 
| 70 | 
             
                },
         | 
| 71 | 
             
                {
         | 
| @@ -77,7 +89,7 @@ | |
| 77 | 
             
                {
         | 
| 78 | 
             
                  "batch_size": 4,
         | 
| 79 | 
             
                  "sequence_length": 4096,
         | 
| 80 | 
            -
                  "num_cores":  | 
| 81 | 
             
                  "auto_cast_type": "fp16"
         | 
| 82 | 
             
                },
         | 
| 83 | 
             
                {
         | 
| @@ -89,7 +101,7 @@ | |
| 89 | 
             
                {
         | 
| 90 | 
             
                  "batch_size": 8,
         | 
| 91 | 
             
                  "sequence_length": 4096,
         | 
| 92 | 
            -
                  "num_cores":  | 
| 93 | 
             
                  "auto_cast_type": "fp16"
         | 
| 94 | 
             
                },
         | 
| 95 | 
             
                {
         | 
| @@ -97,6 +109,30 @@ | |
| 97 | 
             
                  "sequence_length": 4096,
         | 
| 98 | 
             
                  "num_cores": 24,
         | 
| 99 | 
             
                  "auto_cast_type": "fp16"
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 100 | 
             
                }
         | 
| 101 | 
             
              ],
         | 
| 102 | 
             
              "meta-llama/Llama-2-70b-chat-hf": [
         | 
|  | |
| 59 | 
             
                  "sequence_length": 4096,
         | 
| 60 | 
             
                  "num_cores": 24,
         | 
| 61 | 
             
                  "auto_cast_type": "fp16"
         | 
| 62 | 
            +
                },
         | 
| 63 | 
            +
                {
         | 
| 64 | 
            +
                  "batch_size": 32,
         | 
| 65 | 
            +
                  "sequence_length": 4096,
         | 
| 66 | 
            +
                  "num_cores": 8,
         | 
| 67 | 
            +
                  "auto_cast_type": "fp16"
         | 
| 68 | 
            +
                },
         | 
| 69 | 
            +
                {
         | 
| 70 | 
            +
                  "batch_size": 32,
         | 
| 71 | 
            +
                  "sequence_length": 4096,
         | 
| 72 | 
            +
                  "num_cores": 24,
         | 
| 73 | 
            +
                  "auto_cast_type": "fp16"
         | 
| 74 | 
             
                }
         | 
| 75 | 
             
              ],
         | 
| 76 | 
             
              "meta-llama/Llama-2-13b-chat-hf": [
         | 
| 77 | 
             
                {
         | 
| 78 | 
             
                  "batch_size": 1,
         | 
| 79 | 
             
                  "sequence_length": 4096,
         | 
| 80 | 
            +
                  "num_cores": 12,
         | 
| 81 | 
             
                  "auto_cast_type": "fp16"
         | 
| 82 | 
             
                },
         | 
| 83 | 
             
                {
         | 
|  | |
| 89 | 
             
                {
         | 
| 90 | 
             
                  "batch_size": 4,
         | 
| 91 | 
             
                  "sequence_length": 4096,
         | 
| 92 | 
            +
                  "num_cores": 12,
         | 
| 93 | 
             
                  "auto_cast_type": "fp16"
         | 
| 94 | 
             
                },
         | 
| 95 | 
             
                {
         | 
|  | |
| 101 | 
             
                {
         | 
| 102 | 
             
                  "batch_size": 8,
         | 
| 103 | 
             
                  "sequence_length": 4096,
         | 
| 104 | 
            +
                  "num_cores": 12,
         | 
| 105 | 
             
                  "auto_cast_type": "fp16"
         | 
| 106 | 
             
                },
         | 
| 107 | 
             
                {
         | 
|  | |
| 109 | 
             
                  "sequence_length": 4096,
         | 
| 110 | 
             
                  "num_cores": 24,
         | 
| 111 | 
             
                  "auto_cast_type": "fp16"
         | 
| 112 | 
            +
                },
         | 
| 113 | 
            +
                {
         | 
| 114 | 
            +
                  "batch_size": 16,
         | 
| 115 | 
            +
                  "sequence_length": 4096,
         | 
| 116 | 
            +
                  "num_cores": 12,
         | 
| 117 | 
            +
                  "auto_cast_type": "fp16"
         | 
| 118 | 
            +
                },
         | 
| 119 | 
            +
                {
         | 
| 120 | 
            +
                  "batch_size": 16,
         | 
| 121 | 
            +
                  "sequence_length": 4096,
         | 
| 122 | 
            +
                  "num_cores": 24,
         | 
| 123 | 
            +
                  "auto_cast_type": "fp16"
         | 
| 124 | 
            +
                },
         | 
| 125 | 
            +
                {
         | 
| 126 | 
            +
                  "batch_size": 32,
         | 
| 127 | 
            +
                  "sequence_length": 4096,
         | 
| 128 | 
            +
                  "num_cores": 12,
         | 
| 129 | 
            +
                  "auto_cast_type": "fp16"
         | 
| 130 | 
            +
                },
         | 
| 131 | 
            +
                {
         | 
| 132 | 
            +
                  "batch_size": 32,
         | 
| 133 | 
            +
                  "sequence_length": 4096,
         | 
| 134 | 
            +
                  "num_cores": 24,
         | 
| 135 | 
            +
                  "auto_cast_type": "fp16"
         | 
| 136 | 
             
                }
         | 
| 137 | 
             
              ],
         | 
| 138 | 
             
              "meta-llama/Llama-2-70b-chat-hf": [
         | 

