From 346db3d59d1c7a0b683e97262891c6bda020be72 Mon Sep 17 00:00:00 2001 From: pliny <133052465+elder-plinius@users.noreply.github.com> Date: Sun, 8 Mar 2026 12:23:58 -0700 Subject: [PATCH] Add files via upload --- app.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/app.py b/app.py index 28f2d07..f0f7ee1 100644 --- a/app.py +++ b/app.py @@ -2452,6 +2452,12 @@ def chat_respond(message: str, history: list[dict], system_prompt: str, # Base 120s + ~0.1s per token gives headroom for slow models. stream_timeout = max(120, 120 + int(max_tokens * 0.1)) streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=stream_timeout) + + # Resolve pad/eos token IDs so generate() doesn't warn or hang. + # Some tokenizers (e.g. LLaMA) have pad_token == eos_token after our + # earlier fixup — that's fine, we just need explicit IDs in gen_kwargs. + _eos_id = tokenizer.eos_token_id + _pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else _eos_id gen_kwargs = { **inputs, "max_new_tokens": int(max_tokens), @@ -2460,6 +2466,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str, "top_p": top_p, "repetition_penalty": float(repetition_penalty), "streamer": streamer, + "pad_token_id": _pad_id, + "eos_token_id": _eos_id, } # Run generation in a thread; capture any CUDA/runtime errors so they @@ -2468,7 +2476,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str, def _generate_safe(**kwargs): try: - model.generate(**kwargs) + with torch.inference_mode(): + model.generate(**kwargs) except Exception as e: gen_error[0] = e # Signal the streamer to stop so the main thread doesn't hang @@ -2885,12 +2894,16 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[ inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=context_length) + _eos_id = tokenizer.eos_token_id + _pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else _eos_id gen_kwargs_base = { "max_new_tokens": int(max_tokens), "do_sample": temperature > 0, "temperature": max(temperature, 0.01), "top_p": top_p, "repetition_penalty": float(repetition_penalty), + "pad_token_id": _pad_id, + "eos_token_id": _eos_id, } # Add user message to both histories @@ -2907,7 +2920,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[ def _gen_abliterated(**kwargs): try: - abliterated_model.generate(**kwargs) + with torch.inference_mode(): + abliterated_model.generate(**kwargs) except Exception as e: gen_error_abl[0] = e try: @@ -2967,7 +2981,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[ def _gen_original(**kwargs): try: - original_model.generate(**kwargs) # noqa: F821 + with torch.inference_mode(): + original_model.generate(**kwargs) # noqa: F821 except Exception as e: gen_error_orig[0] = e try: @@ -5167,14 +5182,17 @@ Built on the shoulders of: ) # Wire session model auto-loading (Chat tab dropdown change) - # Always pass choices + value together so ZeroGPU doesn't hit stale choices + # NOTE: .then syncs choices ONLY (not value) to the other dropdown. + # Syncing value would create an infinite cascade: dd1.change → .then + # sets dd2 value → dd2.change → .then sets dd1 value → dd1.change … + # The obliterate/benchmark functions already set both dropdowns to the + # same value in their final yield, so no value sync is needed here. session_model_dd.change( fn=load_bench_into_chat, inputs=[session_model_dd], outputs=[session_load_status, chat_status], ).then( - fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()), - inputs=[session_model_dd], + fn=lambda: (gr.update(choices=_get_session_model_choices()), _get_vram_html()), outputs=[ab_session_model_dd, vram_display], ) @@ -5184,8 +5202,7 @@ Built on the shoulders of: inputs=[ab_session_model_dd], outputs=[ab_session_load_status, chat_status], ).then( - fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()), - inputs=[ab_session_model_dd], + fn=lambda: (gr.update(choices=_get_session_model_choices()), _get_vram_html()), outputs=[session_model_dd, vram_display], )