Add files via upload

This commit is contained in:
pliny
2026-03-08 12:23:58 -07:00
committed by GitHub
parent 26e1c5b13b
commit 346db3d59d
+25 -8
View File
@@ -2452,6 +2452,12 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
# Base 120s + ~0.1s per token gives headroom for slow models.
stream_timeout = max(120, 120 + int(max_tokens * 0.1))
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=stream_timeout)
# Resolve pad/eos token IDs so generate() doesn't warn or hang.
# Some tokenizers (e.g. LLaMA) have pad_token == eos_token after our
# earlier fixup — that's fine, we just need explicit IDs in gen_kwargs.
_eos_id = tokenizer.eos_token_id
_pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else _eos_id
gen_kwargs = {
**inputs,
"max_new_tokens": int(max_tokens),
@@ -2460,6 +2466,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
"top_p": top_p,
"repetition_penalty": float(repetition_penalty),
"streamer": streamer,
"pad_token_id": _pad_id,
"eos_token_id": _eos_id,
}
# Run generation in a thread; capture any CUDA/runtime errors so they
@@ -2468,7 +2476,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
def _generate_safe(**kwargs):
try:
model.generate(**kwargs)
with torch.inference_mode():
model.generate(**kwargs)
except Exception as e:
gen_error[0] = e
# Signal the streamer to stop so the main thread doesn't hang
@@ -2885,12 +2894,16 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=context_length)
_eos_id = tokenizer.eos_token_id
_pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else _eos_id
gen_kwargs_base = {
"max_new_tokens": int(max_tokens),
"do_sample": temperature > 0,
"temperature": max(temperature, 0.01),
"top_p": top_p,
"repetition_penalty": float(repetition_penalty),
"pad_token_id": _pad_id,
"eos_token_id": _eos_id,
}
# Add user message to both histories
@@ -2907,7 +2920,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
def _gen_abliterated(**kwargs):
try:
abliterated_model.generate(**kwargs)
with torch.inference_mode():
abliterated_model.generate(**kwargs)
except Exception as e:
gen_error_abl[0] = e
try:
@@ -2967,7 +2981,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
def _gen_original(**kwargs):
try:
original_model.generate(**kwargs) # noqa: F821
with torch.inference_mode():
original_model.generate(**kwargs) # noqa: F821
except Exception as e:
gen_error_orig[0] = e
try:
@@ -5167,14 +5182,17 @@ Built on the shoulders of:
)
# Wire session model auto-loading (Chat tab dropdown change)
# Always pass choices + value together so ZeroGPU doesn't hit stale choices
# NOTE: .then syncs choices ONLY (not value) to the other dropdown.
# Syncing value would create an infinite cascade: dd1.change → .then
# sets dd2 value → dd2.change → .then sets dd1 value → dd1.change …
# The obliterate/benchmark functions already set both dropdowns to the
# same value in their final yield, so no value sync is needed here.
session_model_dd.change(
fn=load_bench_into_chat,
inputs=[session_model_dd],
outputs=[session_load_status, chat_status],
).then(
fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
inputs=[session_model_dd],
fn=lambda: (gr.update(choices=_get_session_model_choices()), _get_vram_html()),
outputs=[ab_session_model_dd, vram_display],
)
@@ -5184,8 +5202,7 @@ Built on the shoulders of:
inputs=[ab_session_model_dd],
outputs=[ab_session_load_status, chat_status],
).then(
fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
inputs=[ab_session_model_dd],
fn=lambda: (gr.update(choices=_get_session_model_choices()), _get_vram_html()),
outputs=[session_model_dd, vram_display],
)