Add files via upload

2026-07-24 12:50:54 +02:00 · 2026-03-08 12:23:58 -07:00
parent 26e1c5b13b
commit 346db3d59d
1 changed files with 25 additions and 8 deletions
@@ -2452,6 +2452,12 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
    # Base 120s + ~0.1s per token gives headroom for slow models.
    stream_timeout = max(120, 120 + int(max_tokens * 0.1))
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=stream_timeout)
+
+    # Resolve pad/eos token IDs so generate() doesn't warn or hang.
+    # Some tokenizers (e.g. LLaMA) have pad_token == eos_token after our
+    # earlier fixup — that's fine, we just need explicit IDs in gen_kwargs.
+    _eos_id = tokenizer.eos_token_id
+    _pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else _eos_id
    gen_kwargs = {
        **inputs,
        "max_new_tokens": int(max_tokens),
@@ -2460,6 +2466,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
        "top_p": top_p,
        "repetition_penalty": float(repetition_penalty),
        "streamer": streamer,
+        "pad_token_id": _pad_id,
+        "eos_token_id": _eos_id,
    }

    # Run generation in a thread; capture any CUDA/runtime errors so they
@@ -2468,7 +2476,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,

    def _generate_safe(**kwargs):
        try:
-            model.generate(**kwargs)
+            with torch.inference_mode():
+                model.generate(**kwargs)
        except Exception as e:
            gen_error[0] = e
            # Signal the streamer to stop so the main thread doesn't hang
@@ -2885,12 +2894,16 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=context_length)

+    _eos_id = tokenizer.eos_token_id
+    _pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else _eos_id
    gen_kwargs_base = {
        "max_new_tokens": int(max_tokens),
        "do_sample": temperature > 0,
        "temperature": max(temperature, 0.01),
        "top_p": top_p,
        "repetition_penalty": float(repetition_penalty),
+        "pad_token_id": _pad_id,
+        "eos_token_id": _eos_id,
    }

    # Add user message to both histories
@@ -2907,7 +2920,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[

    def _gen_abliterated(**kwargs):
        try:
-            abliterated_model.generate(**kwargs)
+            with torch.inference_mode():
+                abliterated_model.generate(**kwargs)
        except Exception as e:
            gen_error_abl[0] = e
            try:
@@ -2967,7 +2981,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[

        def _gen_original(**kwargs):
            try:
-                original_model.generate(**kwargs)  # noqa: F821
+                with torch.inference_mode():
+                    original_model.generate(**kwargs)  # noqa: F821
            except Exception as e:
                gen_error_orig[0] = e
                try:
@@ -5167,14 +5182,17 @@ Built on the shoulders of:
    )

    # Wire session model auto-loading (Chat tab dropdown change)
-    # Always pass choices + value together so ZeroGPU doesn't hit stale choices
+    # NOTE: .then syncs choices ONLY (not value) to the other dropdown.
+    # Syncing value would create an infinite cascade: dd1.change → .then
+    # sets dd2 value → dd2.change → .then sets dd1 value → dd1.change …
+    # The obliterate/benchmark functions already set both dropdowns to the
+    # same value in their final yield, so no value sync is needed here.
    session_model_dd.change(
        fn=load_bench_into_chat,
        inputs=[session_model_dd],
        outputs=[session_load_status, chat_status],
    ).then(
-        fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
-        inputs=[session_model_dd],
+        fn=lambda: (gr.update(choices=_get_session_model_choices()), _get_vram_html()),
        outputs=[ab_session_model_dd, vram_display],
    )

@@ -5184,8 +5202,7 @@ Built on the shoulders of:
        inputs=[ab_session_model_dd],
        outputs=[ab_session_load_status, chat_status],
    ).then(
-        fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
-        inputs=[ab_session_model_dd],
+        fn=lambda: (gr.update(choices=_get_session_model_choices()), _get_vram_html()),
        outputs=[session_model_dd, vram_display],
    )