Browse Source

server.cpp: cleanup cross attention state

jmorganca 7 months ago
parent
commit
71e76f8c90
1 changed file with 7 additions and 5 deletions
  1. 7 5
      llm/ext_server/server.cpp

+ 7 - 5
llm/ext_server/server.cpp

@@ -729,6 +729,10 @@ struct llama_server_context
             slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
         }
 
+        // Check for mllama architecture, which processes images differently than llava
+        char arch_str[256];
+        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
+        bool is_mllama = strcmp(arch_str, "mllama") == 0;
         if (multimodal)
         {
             const auto &images_data = data.find("image_data");
@@ -738,11 +742,6 @@ struct llama_server_context
                 {
                     const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
 
-                    // Check for mllama architecture, which processes images differently than llava
-                    char arch_str[256];
-                    llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
-                    bool is_mllama = strcmp(arch_str, "mllama") == 0;
-
                     if (is_mllama) {
                         LOG_INFO("MLLAMA architecture detected, processing first image", {{"slot_id", slot->id}});
 
@@ -820,6 +819,8 @@ struct llama_server_context
                     slot->params.input_suffix = prompt.substr(begin_prefix);
                     slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
                 }
+            } else {
+                llama_set_cross_attn_state(ctx, nullptr);
             }
         }
 
@@ -1496,6 +1497,7 @@ struct llama_server_context
                 {
                     if (slot.task_id == task.target_id)
                     {
+                        slot.reset();
                         slot.release();
                         break;
                     }