mirror of
https://github.com/elder-plinius/OBLITERATUS.git
synced 2026-06-07 14:53:53 +02:00
Add files via upload
This commit is contained in:
@@ -2452,6 +2452,12 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
|
||||
# Base 120s + ~0.1s per token gives headroom for slow models.
|
||||
stream_timeout = max(120, 120 + int(max_tokens * 0.1))
|
||||
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=stream_timeout)
|
||||
|
||||
# Resolve pad/eos token IDs so generate() doesn't warn or hang.
|
||||
# Some tokenizers (e.g. LLaMA) have pad_token == eos_token after our
|
||||
# earlier fixup — that's fine, we just need explicit IDs in gen_kwargs.
|
||||
_eos_id = tokenizer.eos_token_id
|
||||
_pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else _eos_id
|
||||
gen_kwargs = {
|
||||
**inputs,
|
||||
"max_new_tokens": int(max_tokens),
|
||||
@@ -2460,6 +2466,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
|
||||
"top_p": top_p,
|
||||
"repetition_penalty": float(repetition_penalty),
|
||||
"streamer": streamer,
|
||||
"pad_token_id": _pad_id,
|
||||
"eos_token_id": _eos_id,
|
||||
}
|
||||
|
||||
# Run generation in a thread; capture any CUDA/runtime errors so they
|
||||
@@ -2468,7 +2476,8 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
|
||||
|
||||
def _generate_safe(**kwargs):
|
||||
try:
|
||||
model.generate(**kwargs)
|
||||
with torch.inference_mode():
|
||||
model.generate(**kwargs)
|
||||
except Exception as e:
|
||||
gen_error[0] = e
|
||||
# Signal the streamer to stop so the main thread doesn't hang
|
||||
@@ -2885,12 +2894,16 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
|
||||
|
||||
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=context_length)
|
||||
|
||||
_eos_id = tokenizer.eos_token_id
|
||||
_pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else _eos_id
|
||||
gen_kwargs_base = {
|
||||
"max_new_tokens": int(max_tokens),
|
||||
"do_sample": temperature > 0,
|
||||
"temperature": max(temperature, 0.01),
|
||||
"top_p": top_p,
|
||||
"repetition_penalty": float(repetition_penalty),
|
||||
"pad_token_id": _pad_id,
|
||||
"eos_token_id": _eos_id,
|
||||
}
|
||||
|
||||
# Add user message to both histories
|
||||
@@ -2907,7 +2920,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
|
||||
|
||||
def _gen_abliterated(**kwargs):
|
||||
try:
|
||||
abliterated_model.generate(**kwargs)
|
||||
with torch.inference_mode():
|
||||
abliterated_model.generate(**kwargs)
|
||||
except Exception as e:
|
||||
gen_error_abl[0] = e
|
||||
try:
|
||||
@@ -2967,7 +2981,8 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
|
||||
|
||||
def _gen_original(**kwargs):
|
||||
try:
|
||||
original_model.generate(**kwargs) # noqa: F821
|
||||
with torch.inference_mode():
|
||||
original_model.generate(**kwargs) # noqa: F821
|
||||
except Exception as e:
|
||||
gen_error_orig[0] = e
|
||||
try:
|
||||
@@ -5167,14 +5182,17 @@ Built on the shoulders of:
|
||||
)
|
||||
|
||||
# Wire session model auto-loading (Chat tab dropdown change)
|
||||
# Always pass choices + value together so ZeroGPU doesn't hit stale choices
|
||||
# NOTE: .then syncs choices ONLY (not value) to the other dropdown.
|
||||
# Syncing value would create an infinite cascade: dd1.change → .then
|
||||
# sets dd2 value → dd2.change → .then sets dd1 value → dd1.change …
|
||||
# The obliterate/benchmark functions already set both dropdowns to the
|
||||
# same value in their final yield, so no value sync is needed here.
|
||||
session_model_dd.change(
|
||||
fn=load_bench_into_chat,
|
||||
inputs=[session_model_dd],
|
||||
outputs=[session_load_status, chat_status],
|
||||
).then(
|
||||
fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
|
||||
inputs=[session_model_dd],
|
||||
fn=lambda: (gr.update(choices=_get_session_model_choices()), _get_vram_html()),
|
||||
outputs=[ab_session_model_dd, vram_display],
|
||||
)
|
||||
|
||||
@@ -5184,8 +5202,7 @@ Built on the shoulders of:
|
||||
inputs=[ab_session_model_dd],
|
||||
outputs=[ab_session_load_status, chat_status],
|
||||
).then(
|
||||
fn=lambda v: (gr.update(choices=_get_session_model_choices(), value=v), _get_vram_html()),
|
||||
inputs=[ab_session_model_dd],
|
||||
fn=lambda: (gr.update(choices=_get_session_model_choices()), _get_vram_html()),
|
||||
outputs=[session_model_dd, vram_display],
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user