From 346db3d59d1c7a0b683e97262891c6bda020be72 Mon Sep 17 00:00:00 2001
From: pliny <133052465+elder-plinius@users.noreply.github.com>
Date: Sun, 8 Mar 2026 12:23:58 -0700
Subject: [PATCH] Pass pad/eos IDs to generate(), use inference_mode, fix dropdown sync loop

---
 app.py | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/app.py b/app.py
index 28f2d07..f0f7ee1 100644
--- a/app.py
+++ b/app.py
@@ -2452,6 +2452,12 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
     # Base 120s + ~0.1s per token gives headroom for slow models.
     stream_timeout = max(120, 120 + int(max_tokens * 0.1))
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=stream_timeout)
+
+    # Resolve pad/eos token IDs so generate() doesn't warn or hang.
+    # Some tokenizers (e.g. LLaMA) have pad_token == eos_token after our
+    # earlier fixup — that's fine, we just need explicit IDs in gen_kwargs.
+    _eos_id = tokenizer.eos_token_id
+    _pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else _eos_id
     gen_kwargs = {
         **inputs,
         "max_new_tokens": int(max_tokens),
@@ -2460,6 +2466,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
         "top_p": top_p,
         "repetition_penalty": float(repetition_penalty),
         "streamer": streamer,
+        "pad_token_id": _pad_id,
+        "eos_token_id": _eos_id,
     }
 
     # Run generation in a thread; capture any CUDA/runtime errors so they
@@ -2468,7 +2476,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
 
     def _generate_safe(**kwargs):
         try:
-            model.generate(**kwargs)
+            with torch.inference_mode():
+                model.generate(**kwargs)
         except Exception as e:
             gen_error[0] = e
             # Signal the streamer to stop so the main thread doesn't hang
@@ -2885,12 +2894,16 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
 
     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=context_length)
 
+    _eos_id = tokenizer.eos_token_id
+    _pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else _eos_id
     gen_kwargs_base = {
         "max_new_tokens": int(max_tokens),
         "do_sample": temperature > 0,
         "temperature": max(temperature, 0.01),
         "top_p": top_p,
         "repetition_penalty": float(repetition_penalty),
+        "pad_token_id": _pad_id,
+        "eos_token_id": _eos_id,
     }
 
     # Add user message to both histories
@@ -2907,7 +2920,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
 
     def _gen_abliterated(**kwargs):
         try:
-            abliterated_model.generate(**kwargs)
+            with torch.inference_mode():
+                abliterated_model.generate(**kwargs)
         except Exception as e:
             gen_error_abl[0] = e
             try:
@@ -2967,7 +2981,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
 
         def _gen_original(**kwargs):
             try:
-                original_model.generate(**kwargs)  # noqa: F821
+                with torch.inference_mode():
+                    original_model.generate(**kwargs)  # noqa: F821
             except Exception as e:
                 gen_error_orig[0] = e
                 try:
@@ -5167,14 +5182,17 @@ Built on the shoulders of:
     )
 
     # Wire session model auto-loading (Chat tab dropdown change)
-    # Always pass choices + value together so ZeroGPU doesn't hit stale choices
+    # NOTE: .then syncs choices ONLY (not value) to the other dropdown.
+    # Syncing value would create an infinite cascade: dd1.change → .then
+    # sets dd2 value → dd2.change → .then sets dd1 value → dd1.change …
+    # The obliterate/benchmark functions already set both dropdowns to the
+    # same value in their final yield, so no value sync is needed here.
     session_model_dd.change(
         fn=load_bench_into_chat,
         inputs=[session_model_dd],
         outputs=[session_load_status, chat_status],
     ).then(
-        fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
-        inputs=[session_model_dd],
+        fn=lambda: (gr.update(choices=_get_session_model_choices()), _get_vram_html()),
         outputs=[ab_session_model_dd, vram_display],
     )
 
@@ -5184,8 +5202,7 @@ Built on the shoulders of:
         inputs=[ab_session_model_dd],
         outputs=[ab_session_load_status, chat_status],
     ).then(
-        fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
-        inputs=[ab_session_model_dd],
+        fn=lambda: (gr.update(choices=_get_session_model_choices()), _get_vram_html()),
         outputs=[session_model_dd, vram_display],
     )