Merge pull request oobabooga#5530 from oobabooga/dev
Merge dev branch
oobabooga authored Feb 17, 2024
2 parents 771c592 + af0bbf5 commit dd46229
Showing 22 changed files with 310 additions and 182 deletions.
35 changes: 22 additions & 13 deletions download-model.py
@@ -26,13 +26,16 @@
 
 class ModelDownloader:
     def __init__(self, max_retries=5):
-        self.session = requests.Session()
-        if max_retries:
-            self.session.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=max_retries))
-            self.session.mount('https://huggingface.co', HTTPAdapter(max_retries=max_retries))
+        self.max_retries = max_retries
+
+    def get_session(self):
+        session = requests.Session()
+        if self.max_retries:
+            session.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=self.max_retries))
+            session.mount('https://huggingface.co', HTTPAdapter(max_retries=self.max_retries))
 
         if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None:
-            self.session.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))
+            session.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))
 
         try:
             from huggingface_hub import get_token
@@ -41,7 +44,9 @@ def __init__(self, max_retries=5):
             token = os.getenv("HF_TOKEN")
 
         if token is not None:
-            self.session.headers = {'authorization': f'Bearer {token}'}
+            session.headers = {'authorization': f'Bearer {token}'}
+
+        return session
 
     def sanitize_model_and_branch_names(self, model, branch):
         if model[-1] == '/':
@@ -65,6 +70,7 @@ def sanitize_model_and_branch_names(self, model, branch):
         return model, branch
 
     def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None):
+        session = self.get_session()
        page = f"/api/models/{model}/tree/{branch}"
        cursor = b""
 
@@ -78,7 +84,7 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None):
         is_lora = False
         while True:
             url = f"{base}{page}" + (f"?cursor={cursor.decode()}" if cursor else "")
-            r = self.session.get(url, timeout=10)
+            r = session.get(url, timeout=10)
             r.raise_for_status()
             content = r.content
 
@@ -156,9 +162,8 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None):
         is_llamacpp = has_gguf and specific_file is not None
         return links, sha256, is_lora, is_llamacpp
 
-    def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, base_folder=None):
-        if base_folder is None:
-            base_folder = 'models' if not is_lora else 'loras'
+    def get_output_folder(self, model, branch, is_lora, is_llamacpp=False):
+        base_folder = 'models' if not is_lora else 'loras'
 
         # If the model is of type GGUF, save directly in the base_folder
         if is_llamacpp:
@@ -172,14 +177,15 @@ def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, base_folder=None):
         return output_folder
 
     def get_single_file(self, url, output_folder, start_from_scratch=False):
+        session = self.get_session()
         filename = Path(url.rsplit('/', 1)[1])
         output_path = output_folder / filename
         headers = {}
         mode = 'wb'
         if output_path.exists() and not start_from_scratch:
 
             # Check if the file has already been downloaded completely
-            r = self.session.get(url, stream=True, timeout=10)
+            r = session.get(url, stream=True, timeout=10)
             total_size = int(r.headers.get('content-length', 0))
             if output_path.stat().st_size >= total_size:
                 return
@@ -188,7 +194,7 @@ def get_single_file(self, url, output_folder, start_from_scratch=False):
             headers = {'Range': f'bytes={output_path.stat().st_size}-'}
             mode = 'ab'
 
-        with self.session.get(url, stream=True, headers=headers, timeout=10) as r:
+        with session.get(url, stream=True, headers=headers, timeout=10) as r:
             r.raise_for_status()  # Do not continue the download if the request was unsuccessful
             total_size = int(r.headers.get('content-length', 0))
             block_size = 1024 * 1024  # 1MB
@@ -303,7 +309,10 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
     links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=args.text_only, specific_file=specific_file)
 
     # Get the output folder
-    output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp, base_folder=args.output)
+    if args.output:
+        output_folder = Path(args.output)
+    else:
+        output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp)
 
     if args.check:
         # Check previously downloaded files
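
The net effect of this refactor: every network operation now builds a fresh authenticated session through get_session() instead of reusing one created in __init__, and the --output override is resolved in the CLI entry point rather than inside get_output_folder(). A rough usage sketch based only on the methods visible in this diff (the model name and output value are placeholders, and download-model.py is loaded by path because the hyphen prevents a normal import):

    import importlib.util
    from pathlib import Path

    # Load download-model.py by file path (hyphenated filename, so a plain import will not work)
    spec = importlib.util.spec_from_file_location("download_model", "download-model.py")
    download_model = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(download_model)

    downloader = download_model.ModelDownloader(max_retries=5)
    model, branch = downloader.sanitize_model_and_branch_names('TheBloke/Llama-2-7B-GGUF', 'main')  # placeholder model
    links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch)

    # Mirror the new __main__ logic: an explicit --output path wins over the default models/loras layout
    output = None  # stand-in for args.output
    if output:
        output_folder = Path(output)
    else:
        output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp)
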
2 changes: 1 addition & 1 deletion instruction-templates/Mistral.yaml
@@ -4,7 +4,7 @@ instruction_template: |-
           {{- message['content'] -}}
       {%- else -%}
           {%- if message['role'] == 'user' -%}
-              {{-' [INST] ' + message['content'].rstrip() + ' [/INST] '-}}
+              {{-'[INST] ' + message['content'].rstrip() + ' [/INST]'-}}
           {%- else -%}
               {{-'' + message['content'] + '</s>' -}}
           {%- endif -%}
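
The only change here is whitespace: the spaces that used to surround the [INST] tags are dropped. A minimal sketch of the difference, rendering both expressions with jinja2 outside the webui (the single-turn content string is made up):

    from jinja2 import Template

    old = Template("{{-' [INST] ' + content.rstrip() + ' [/INST] '-}}")
    new = Template("{{-'[INST] ' + content.rstrip() + ' [/INST]'-}}")

    print(repr(old.render(content="Hello")))  # ' [INST] Hello [/INST] '
    print(repr(new.render(content="Hello")))  # '[INST] Hello [/INST]'
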
68 changes: 36 additions & 32 deletions modules/chat.py
@@ -166,53 +166,54 @@ def make_prompt(messages):
         prompt = remove_extra_bos(prompt)
         return prompt
 
-    # Handle truncation
-    max_length = get_max_prompt_length(state)
     prompt = make_prompt(messages)
-    encoded_length = get_encoded_length(prompt)
 
-    while len(messages) > 0 and encoded_length > max_length:
+    # Handle truncation
+    if shared.tokenizer is not None:
+        max_length = get_max_prompt_length(state)
+        encoded_length = get_encoded_length(prompt)
+        while len(messages) > 0 and encoded_length > max_length:
 
-        # Remove old message, save system message
-        if len(messages) > 2 and messages[0]['role'] == 'system':
-            messages.pop(1)
+            # Remove old message, save system message
+            if len(messages) > 2 and messages[0]['role'] == 'system':
+                messages.pop(1)
 
-        # Remove old message when no system message is present
-        elif len(messages) > 1 and messages[0]['role'] != 'system':
-            messages.pop(0)
+            # Remove old message when no system message is present
+            elif len(messages) > 1 and messages[0]['role'] != 'system':
+                messages.pop(0)
 
-        # Resort to truncating the user input
-        else:
+            # Resort to truncating the user input
+            else:
 
-            user_message = messages[-1]['content']
+                user_message = messages[-1]['content']
 
-            # Bisect the truncation point
-            left, right = 0, len(user_message) - 1
+                # Bisect the truncation point
+                left, right = 0, len(user_message) - 1
 
-            while right - left > 1:
-                mid = (left + right) // 2
+                while right - left > 1:
+                    mid = (left + right) // 2
 
-                messages[-1]['content'] = user_message[mid:]
-                prompt = make_prompt(messages)
-                encoded_length = get_encoded_length(prompt)
+                    messages[-1]['content'] = user_message[mid:]
+                    prompt = make_prompt(messages)
+                    encoded_length = get_encoded_length(prompt)
 
-                if encoded_length <= max_length:
-                    right = mid
-                else:
-                    left = mid
+                    if encoded_length <= max_length:
+                        right = mid
+                    else:
+                        left = mid
 
-            messages[-1]['content'] = user_message[right:]
-            prompt = make_prompt(messages)
-            encoded_length = get_encoded_length(prompt)
-            if encoded_length > max_length:
-                logger.error(f"Failed to build the chat prompt. The input is too long for the available context length.\n\nTruncation length: {state['truncation_length']}\nmax_new_tokens: {state['max_new_tokens']} (is it too high?)\nAvailable context length: {max_length}\n")
-                raise ValueError
-            else:
-                logger.warning(f"The input has been truncated. Context length: {state['truncation_length']}, max_new_tokens: {state['max_new_tokens']}, available context length: {max_length}.")
-                break
+                messages[-1]['content'] = user_message[right:]
+                prompt = make_prompt(messages)
+                encoded_length = get_encoded_length(prompt)
+                if encoded_length > max_length:
+                    logger.error(f"Failed to build the chat prompt. The input is too long for the available context length.\n\nTruncation length: {state['truncation_length']}\nmax_new_tokens: {state['max_new_tokens']} (is it too high?)\nAvailable context length: {max_length}\n")
+                    raise ValueError
+                else:
+                    logger.warning(f"The input has been truncated. Context length: {state['truncation_length']}, max_new_tokens: {state['max_new_tokens']}, available context length: {max_length}.")
+                    break
 
-        prompt = make_prompt(messages)
-        encoded_length = get_encoded_length(prompt)
+            prompt = make_prompt(messages)
+            encoded_length = get_encoded_length(prompt)
 
     if also_return_rows:
         return prompt, [message['content'] for message in messages]
@@ -690,6 +691,9 @@ def load_character(character, name1, name2):
 
 
 def load_instruction_template(template):
+    if template == 'None':
+        return ''
+
     for filepath in [Path(f'instruction-templates/{template}.yaml'), Path('instruction-templates/Alpaca.yaml')]:
        if filepath.exists():
            break
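
The truncation strategy itself is unchanged by the re-indentation: when dropping whole messages is not enough, the code bisects on a character index until the encoded prompt fits. A stripped-down illustration of that bisection, using a stand-in encoded_len callable instead of the project's tokenizer:

    def truncate_to_fit(text: str, max_length: int, encoded_len) -> str:
        """Return the longest suffix of `text` whose encoded length fits in `max_length`."""
        if encoded_len(text) <= max_length:
            return text

        left, right = 0, len(text) - 1
        while right - left > 1:
            mid = (left + right) // 2
            if encoded_len(text[mid:]) <= max_length:
                right = mid   # suffix starting at mid fits; try keeping more
            else:
                left = mid    # still too long; cut deeper
        return text[right:]

    # Example with a toy "tokenizer" that counts words
    print(truncate_to_fit("one two three four five", 3, lambda s: len(s.split())))
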
21 changes: 12 additions & 9 deletions modules/exllamav2.py
@@ -51,18 +51,21 @@ def from_pretrained(self, path_to_model):
 
         model = ExLlamaV2(config)
 
-        split = None
-        if shared.args.gpu_split:
-            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
-
-        model.load(split)
-
-        tokenizer = ExLlamaV2Tokenizer(config)
         if shared.args.cache_8bit:
-            cache = ExLlamaV2Cache_8bit(model)
+            cache = ExLlamaV2Cache_8bit(model, lazy=True)
         else:
-            cache = ExLlamaV2Cache(model)
+            cache = ExLlamaV2Cache(model, lazy=True)
 
+        if shared.args.autosplit:
+            model.load_autosplit(cache)
+        else:
+            split = None
+            if shared.args.gpu_split:
+                split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
+
+            model.load(split)
+
+        tokenizer = ExLlamaV2Tokenizer(config)
         generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
 
         result = self()
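
The new branch follows ExLlamaV2's lazy-allocation pattern: create the cache with lazy=True before any weights are resident, then let load_autosplit() place both the model and the cache across the visible GPUs. A minimal standalone sketch of that pattern, independent of the webui's shared.args plumbing (the model directory is a placeholder):

    from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config

    config = ExLlamaV2Config()
    config.model_dir = "/path/to/exl2-model"  # placeholder path
    config.prepare()

    model = ExLlamaV2(config)
    cache = ExLlamaV2Cache(model, lazy=True)  # defer allocation until the weights are placed
    model.load_autosplit(cache)               # split model + cache across available GPUs
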
20 changes: 12 additions & 8 deletions modules/exllamav2_hf.py
@@ -37,18 +37,22 @@ def __init__(self, config: ExLlamaV2Config):
         super().__init__(PretrainedConfig())
         self.ex_config = config
         self.ex_model = ExLlamaV2(config)
-        split = None
-        if shared.args.gpu_split:
-            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
-
-        self.ex_model.load(split)
-        self.generation_config = GenerationConfig()
         self.loras = None
+        self.generation_config = GenerationConfig()
 
         if shared.args.cache_8bit:
-            self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model)
+            self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model, lazy=True)
         else:
-            self.ex_cache = ExLlamaV2Cache(self.ex_model)
+            self.ex_cache = ExLlamaV2Cache(self.ex_model, lazy=True)
+
+        if shared.args.autosplit:
+            self.ex_model.load_autosplit(self.ex_cache)
+        else:
+            split = None
+            if shared.args.gpu_split:
+                split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
+
+            self.ex_model.load(split)
 
         self.past_seq = None
         if shared.args.cfg_cache:
2 changes: 2 additions & 0 deletions modules/loaders.py
@@ -78,6 +78,7 @@
         'no_flash_attn',
         'num_experts_per_token',
         'cache_8bit',
+        'autosplit',
         'alpha_value',
         'compress_pos_emb',
         'trust_remote_code',
@@ -89,6 +90,7 @@
         'no_flash_attn',
         'num_experts_per_token',
         'cache_8bit',
+        'autosplit',
         'alpha_value',
         'compress_pos_emb',
         'exllamav2_info',
2 changes: 1 addition & 1 deletion modules/models.py
@@ -257,7 +257,7 @@ def llamacpp_HF_loader(model_name):
     path = Path(f'{shared.args.model_dir}/{model_name}')
 
     # Check if a HF tokenizer is available for the model
-    if all((path / file).exists() for file in ['tokenizer.model', 'tokenizer_config.json']):
+    if all((path / file).exists() for file in ['tokenizer_config.json']):
         logger.info(f'Using tokenizer from: \"{path}\"')
     else:
         logger.error("Could not load the model because a tokenizer in Transformers format was not found.")
65 changes: 47 additions & 18 deletions modules/models_settings.py
@@ -153,6 +153,8 @@ def infer_loader(model_name, model_settings):
         loader = 'ExLlamav2_HF'
     elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
         loader = 'AutoAWQ'
+    elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
+        loader = 'llamacpp_HF'
     elif len(list(path_to_model.glob('*.gguf'))) > 0:
         loader = 'llama.cpp'
     elif re.match(r'.*\.gguf', model_name.lower()):
@@ -225,7 +227,7 @@ def apply_model_settings_to_state(model, state):
     loader = model_settings.pop('loader')
 
     # If the user is using an alternative loader for the same model type, let them keep using it
-    if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']) and not (loader == 'llama.cpp' and state['loader'] in ['llamacpp_HF', 'ctransformers']):
+    if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']) and not (loader == 'llama.cpp' and state['loader'] in ['ctransformers']):
         state['loader'] = loader
 
     for k in model_settings:
@@ -243,27 +245,54 @@ def save_model_settings(model, state):
     Save the settings for this model to models/config-user.yaml
     '''
     if model == 'None':
-        yield ("Not saving the settings because no model is loaded.")
+        yield ("Not saving the settings because no model is selected in the menu.")
         return
 
-    with Path(f'{shared.args.model_dir}/config-user.yaml') as p:
-        if p.exists():
-            user_config = yaml.safe_load(open(p, 'r').read())
-        else:
-            user_config = {}
+    user_config = shared.load_user_config()
+    model_regex = model + '$'  # For exact matches
+    if model_regex not in user_config:
+        user_config[model_regex] = {}
 
-        model_regex = model + '$'  # For exact matches
-        if model_regex not in user_config:
-            user_config[model_regex] = {}
+    for k in ui.list_model_elements():
+        if k == 'loader' or k in loaders.loaders_and_params[state['loader']]:
+            user_config[model_regex][k] = state[k]
 
-        for k in ui.list_model_elements():
-            if k == 'loader' or k in loaders.loaders_and_params[state['loader']]:
-                user_config[model_regex][k] = state[k]
+    shared.user_config = user_config
 
-        shared.user_config = user_config
+    output = yaml.dump(user_config, sort_keys=False)
+    p = Path(f'{shared.args.model_dir}/config-user.yaml')
+    with open(p, 'w') as f:
+        f.write(output)
 
-        output = yaml.dump(user_config, sort_keys=False)
-        with open(p, 'w') as f:
-            f.write(output)
+    yield (f"Settings for `{model}` saved to `{p}`.")
 
-        yield (f"Settings for `{model}` saved to `{p}`.")
+
+def save_instruction_template(model, template):
+    '''
+    Similar to the function above, but it saves only the instruction template.
+    '''
+    if model == 'None':
+        yield ("Not saving the template because no model is selected in the menu.")
+        return
+
+    user_config = shared.load_user_config()
+    model_regex = model + '$'  # For exact matches
+    if model_regex not in user_config:
+        user_config[model_regex] = {}
+
+    if template == 'None':
+        user_config[model_regex].pop('instruction_template', None)
+    else:
+        user_config[model_regex]['instruction_template'] = template
+
+    shared.user_config = user_config
+
+    output = yaml.dump(user_config, sort_keys=False)
+    p = Path(f'{shared.args.model_dir}/config-user.yaml')
+    with open(p, 'w') as f:
+        f.write(output)
+
+    if template == 'None':
+        yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.")
+    else:
+        yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.")