@@ -19,17 +19,19 @@ func TestMultiModelConcurrency(t *testing.T) {
 	var (
 		req = [2]api.GenerateRequest{
 			{
-				Model:  "orca-mini",
-				Prompt: "why is the ocean blue?",
-				Stream: &stream,
+				Model:     "orca-mini",
+				Prompt:    "why is the ocean blue?",
+				Stream:    &stream,
+				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
 				},
 			}, {
-				Model:  "tinydolphin",
-				Prompt: "what is the origin of the us thanksgiving holiday?",
-				Stream: &stream,
+				Model:     "tinydolphin",
+				Prompt:    "what is the origin of the us thanksgiving holiday?",
+				Stream:    &stream,
+				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
@@ -43,7 +45,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 	)
 	var wg sync.WaitGroup
 	wg.Add(len(req))
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
 	defer cancel()
 
 	client, _, cleanup := InitServerConnection(ctx, t)
@@ -56,32 +58,46 @@ func TestMultiModelConcurrency(t *testing.T) {
 	for i := 0; i < len(req); i++ {
 		go func(i int) {
 			defer wg.Done()
-			DoGenerate(ctx, t, client, req[i], resp[i], 30*time.Second, 10*time.Second)
+			DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
 		}(i)
 	}
 	wg.Wait()
 }
 
 func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) // GTX 750 2G card takes ~9 minutes
+	req, resp := GenerateRequests()
+	reqLimit := len(req)
+	iterLimit := 5
+
+	vram := os.Getenv("OLLAMA_MAX_VRAM")
+	if vram != "" {
+		max, err := strconv.ParseUint(vram, 10, 64)
+		require.NoError(t, err)
+		// Don't hammer on small VRAM cards...
+		if max < 4*1024*1024*1024 {
+			reqLimit = min(reqLimit, 2)
+			iterLimit = 2
+		}
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
 
-	req, resp := GenerateRequests()
 	// Get the server running (if applicable) warm the model up with a single initial request
-	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 5*time.Second)
+	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
 
 	var wg sync.WaitGroup
-	wg.Add(len(req))
-	for i := 0; i < len(req); i++ {
+	wg.Add(reqLimit)
+	for i := 0; i < reqLimit; i++ {
 		go func(i int) {
 			defer wg.Done()
-			for j := 0; j < 5; j++ {
+			for j := 0; j < iterLimit; j++ {
 				slog.Info("Starting", "req", i, "iter", j)
-				// On slower GPUs it can take a while to process the 4 concurrent requests
+				// On slower GPUs it can take a while to process the concurrent requests
 				// so we allow a much longer initial timeout
-				DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
+				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
 			}
 		}(i)
 	}
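
For reference, the VRAM-based gating introduced in TestIntegrationConcurrentPredictOrcaMini can be read in isolation as a small standalone sketch. The OLLAMA_MAX_VRAM variable, the strconv.ParseUint parsing, the 4 GiB threshold, and the reqLimit/iterLimit caps come from the diff above; the concurrencyLimits helper name, its signature, and the main demo are illustrative assumptions, not part of the change.

// vramgate_sketch.go — illustrative sketch only, not part of the change above.
package main

import (
	"fmt"
	"os"
	"strconv"
)

// concurrencyLimits mirrors the gating from the test: it reads OLLAMA_MAX_VRAM
// (a byte count) and, for cards under 4 GiB, caps concurrent requests at 2 and
// iterations per request at 2. Helper name and signature are hypothetical.
func concurrencyLimits(defaultReqs, defaultIters int) (int, int, error) {
	reqLimit, iterLimit := defaultReqs, defaultIters
	vram := os.Getenv("OLLAMA_MAX_VRAM")
	if vram == "" {
		// No limit advertised; keep the defaults.
		return reqLimit, iterLimit, nil
	}
	maxVRAM, err := strconv.ParseUint(vram, 10, 64)
	if err != nil {
		return 0, 0, err
	}
	if maxVRAM < 4*1024*1024*1024 { // don't hammer on small VRAM cards
		reqLimit = min(reqLimit, 2) // min builtin requires Go 1.21+
		iterLimit = 2
	}
	return reqLimit, iterLimit, nil
}

func main() {
	reqs, iters, err := concurrencyLimits(4, 5)
	if err != nil {
		fmt.Fprintln(os.Stderr, "invalid OLLAMA_MAX_VRAM:", err)
		os.Exit(1)
	}
	fmt.Printf("reqLimit=%d iterLimit=%d\n", reqs, iters)
}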