|
@@ -424,6 +424,32 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
|
|
4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
|
|
|
4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
|
|
|
)
|
|
|
+ case "chatglm":
|
|
|
+ fullOffload = 4 * batch * (embedding + vocab)
|
|
|
+ partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
|
|
|
+ if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
|
|
|
+ fullOffload = max(
|
|
|
+ fullOffload,
|
|
|
+ 4*batch*(2+
|
|
|
+ 2*embedding+
|
|
|
+ context+
|
|
|
+ context*heads+
|
|
|
+ embeddingHeadsK*heads+
|
|
|
+ qkvBias.Shape[0]),
|
|
|
+ )
|
|
|
+
|
|
|
+ partialOffload = max(
|
|
|
+ partialOffload,
|
|
|
+ 4*batch*(1+
|
|
|
+ 2*embedding+
|
|
|
+ embeddingHeadsK*heads+
|
|
|
+ context+
|
|
|
+ context*heads)+
|
|
|
+ 4*embeddingHeadsK*context+
|
|
|
+ 4*context*embeddingHeadsK+
|
|
|
+ 4*qkvBias.Shape[0],
|
|
|
+ )
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
return
|