--- base_model: unsloth/Qwen3-4B tags: - text-generation-inference - transformers - unsloth - qwen3 - trl - sft license: apache-2.0 language: - en --- # QAT instructions + HF model upload ``` # ========================================================================================= # Fine-tuning script based on https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_%281B_and_3B%29-Conversational.ipynb # This script and HF checkpoint are only intended to showcase how to do finetuning in a way compatible with ExecuTorch # Only 10 steps are done, and quality of the finetuned model is not evaluated # ========================================================================================= from unsloth import FastLanguageModel from unsloth.chat_templates import ( get_chat_template, standardize_data_formats, standardize_sharegpt, train_on_responses_only, ) from datasets import load_dataset from trl import SFTConfig, SFTTrainer from transformers import DataCollatorForSeq2Seq import torch import torch.nn as nn batch_size = 2 learning_rate = 2e-5 gradient_accumulation_steps = 4 max_steps = 10 full_finetuning = True qat_scheme = "int8-int4" output_dir = "/tmp/unsloth_example" model_id = "unsloth/Qwen3-4B" chat_template = "qwen3" max_seq_length = 2048 dtype = torch.bfloat16 load_in_4bit = False ################################################################################ # Define model/tokenizer ################################################################################ model, tokenizer = FastLanguageModel.from_pretrained( model_name=model_id, max_seq_length=max_seq_length, dtype=dtype, load_in_4bit =load_in_4bit, full_finetuning=full_finetuning, qat_scheme=qat_scheme, ) tokenizer = get_chat_template(tokenizer, chat_template = chat_template) print("MODEL AFTER LOADING") print(model) print(model.config) print(model._torchao_config) ################################################################################ # Process dataset ################################################################################ def formatting_prompts_func(examples): convos = examples["conversations"] texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos] return { "text" : texts, } dataset = load_dataset("mlabonne/FineTome-100k", split = "train") dataset = standardize_sharegpt(dataset) dataset = dataset.map(formatting_prompts_func, batched = True,) print("DATASET ENTRY") print(dataset[0]) print("\n\n") ################################################################################ # Define trainer ################################################################################ trainer = SFTTrainer( model=model, tokenizer=tokenizer, train_dataset=dataset, dataset_text_field="text", max_seq_length=max_seq_length, packing=False, args=SFTConfig( per_device_train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps, warmup_steps=5, num_train_epochs=1, max_steps=max_steps, learning_rate=learning_rate, logging_steps=1, optim="adamw_8bit", weight_decay=0.01, lr_scheduler_type="linear", seed=3407, output_dir="outputs", report_to="none", ), ) ################################################################################ # Do fine tuning ################################################################################ print("DOING FINETUNING") trainer_stats = trainer.train() print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.") print( f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training." ) ################################################################################ # Save model ################################################################################ model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) # ################################################################################ # # Convert model to torchao and save to hub # ################################################################################ from unsloth.models._utils import _convert_torchao_model _convert_torchao_model(model) print('MODEL AFTER CONVERT', model) print("CONFIG", model.config) ################################################################################ # Push converted model to hub ################################################################################ from huggingface_hub import get_token, whoami def _get_username(): token = get_token() username = whoami(token=token)["name"] return username username = _get_username() model_name = model_id.split("/")[-1] save_to = f"{username}/{model_name}-{qat_scheme}-unsloth-v2" model.push_to_hub(save_to, safe_serialization=False) tokenizer.push_to_hub(save_to) ################################################################################ # Load converted from hub and inspect ################################################################################ from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(save_to) print('model', model) print("model.embed_tokens.weight", model.model.embed_tokens.weight) print("model.layers[0].self_attn.q_proj.weight", model.model.layers[0].self_attn.q_proj.weight) print("lm_head.weight", model.lm_head.weight) ``` # Exporting to ExecuTorch We can run the quantized model on a mobile phone using [ExecuTorch](https://github.com/pytorch/executorch): ```bash # 1. Install ExecuTorch pip install executorch pytorch_tokenizers torchtune # 2. Download QAT'd weights we uploaded to HF python -m executorch.examples.models.qwen3.convert_weights $(hf download metascroy/Qwen3-4B-int8-int4-unsloth-v2) pytorch_model_converted.bin # 3. Download model config from ExecuTorch repo curl -L -o 4b_config.json https://raw.githubusercontent.com/pytorch/executorch/main/examples/models/qwen3/config/4b_config.json # 4. Export to ExecuTorch pte file python -m executorch.examples.models.llama.export_llama \ --model "qwen3_4b" \ --checkpoint pytorch_model_converted.bin \ --params 4b_config.json \ --output_name model.pte \ -kv \ --use_sdpa_with_kv_cache \ -X \ --xnnpack-extended-ops \ --max_context_length 1024 \ --max_seq_length 512 \ --dtype fp32 \ --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' # 5. (optional) Upload pte file to HuggingFace hf upload metascroy/Qwen3-4B-int8-int4-unsloth-v2 model.pte ``` # Running in a mobile app After that you can run the model in an iOS mobile app using the [executorch-examples repo](https://github.com/meta-pytorch/executorch-examples). First clone the repo and open the Xcode project: ``` git clone https://github.com/meta-pytorch/executorch-examples.git open executorch-examples/llm/apple/etLLM.xcodeproj ``` Once open, connect your iPhone and select it as the device in Xcode. To build the app, press the play button. (This does require you sign the code.) To run the model we just finetuned, you need to transfer the model.pte file and tokenizer.json to your phone. To do this, open finder, select your phone, and drag and drop the files to copy them over. Please rename the pte file to qwen3_model.pte before copying it over because the demo app requires the name begin with qwen3 to use the correct prompt template. In the etLLM app, you can select the model and tokenizer to use by browsing your file system for the "qwen3_model.pte" and "tokenizer.json" files we just copied to the phone. # Uploaded model - **Developed by:** metascroy - **License:** apache-2.0 - **Finetuned from model :** unsloth/Qwen3-4B This qwen3 model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library. [](https://github.com/unslothai/unsloth)