1️⃣Quantization Basic
Quantization 양자화 기본 이론
기본은 모델의 파라미터 구성을 압축하는 것 입니다.
모델 압축 방법 중 하나로, 연결(가중치)을 제거하는 가지치기(Pruning) 방법론을 사용할 수 있습니다.
Knowledge Distillation (지식 전이) 방식은 원래 모델(Instructor)을 사용하여 더 작은 모델(Student)을 훈련함으로써 모델을 압축할 수 있습니다.
Quantization의 기본 원리는 모델 파라미터의 데이터 형식을 더 작게 만드는 것 입니다. (Low Precision)
실수형(Floating Point)은 비트 수에 따라 FP32, FP16, BF16으로 표현되며, 정수형(Integer)은 int32, int16, int8로 표현됩니다.
Pytorch: Data Types and Sizes 실습
머신러닝 모델의 매개변수를 저장하는 데 사용되는 일반적인 데이터 유형에 대해 알아봅니다.
import torch
Integers
# Information of `8-bit unsigned integer`
torch.iinfo(torch.uint8)
> iinfo(min=0, max=255, dtype=uint8)
# Information of `8-bit (signed) integer`
torch.iinfo(torch.int8)
>iinfo(min=-128, max=127, dtype=int8)
### Information of `16-bit (signed) integer`
torch.iinfo(torch.int16)
> iinfo(min=-32768, max=32767, dtype=int16)
### Information of `32-bit (signed) integer`
torch.iinfo(torch.int32)
> iinfo(min=-2.14748e+09, max=2.14748e+09, dtype=int32)
### Information of `64-bit (signed) integer`
torch.iinfo(torch.int64)
> iinfo(min=-9.22337e+18, max=9.22337e+18, dtype=int64)
Floating Points
# by default, python stores float data in fp64
value = 1/3
format(value, '.60f')
> '0.333333333333333314829616256247390992939472198486328125000000'
# 64-bit floating point
tensor_fp64 = torch.tensor(value, dtype = torch.float64)
print(f"fp64 tensor: {format(tensor_fp64.item(), '.60f')}")
> fp64 tensor: 0.333333333333333314829616256247390992939472198486328125000000
tensor_fp32 = torch.tensor(value, dtype = torch.float32)
tensor_fp16 = torch.tensor(value, dtype = torch.float16)
tensor_bf16 = torch.tensor(value, dtype = torch.bfloat16)
print(f"fp64 tensor: {format(tensor_fp64.item(), '.60f')}")
print(f"fp32 tensor: {format(tensor_fp32.item(), '.60f')}")
print(f"fp16 tensor: {format(tensor_fp16.item(), '.60f')}")
print(f"bf16 tensor: {format(tensor_bf16.item(), '.60f')}")
> fp64 tensor: 0.333333333333333314829616256247390992939472198486328125000000
fp32 tensor: 0.333333343267440795898437500000000000000000000000000000000000
fp16 tensor: 0.333251953125000000000000000000000000000000000000000000000000
bf16 tensor: 0.333984375000000000000000000000000000000000000000000000000000
# Information of `16-bit brain floating point`
torch.finfo(torch.bfloat16)
> finfo(resolution=0.01, min=-3.38953e+38, max=3.38953e+38, eps=0.0078125, smallest_normal=1.17549e-38, tiny=1.17549e-38, dtype=bfloat16)
### Information of `16-bit floating point`
torch.finfo(torch.float16)
> finfo(resolution=0.001, min=-65504, max=65504, eps=0.000976562, smallest_normal=6.10352e-05, tiny=6.10352e-05, dtype=float16)
# Information of `32-bit floating point`
torch.finfo(torch.float32)
> finfo(resolution=1e-06, min=-3.40282e+38, max=3.40282e+38, eps=1.19209e-07, smallest_normal=1.17549e-38, tiny=1.17549e-38, dtype=float32)
### Information of `64-bit floating point`
torch.finfo(torch.float64)
> finfo(resolution=1e-15, min=-1.79769e+308, max=1.79769e+308, eps=2.22045e-16, smallest_normal=2.22507e-308, tiny=2.22507e-308, dtype=float64)
Downcasting
# random pytorch tensor: float32, size=1000
tensor_fp32 = torch.rand(1000, dtype = torch.float32)
# first 5 elements of the random tensor
tensor_fp32[:5]
> tensor([0.6183, 0.6456, 0.0934, 0.3034, 0.5854])
# downcast the tensor to bfloat16 using the "to" method
tensor_fp32_to_bf16 = tensor_fp32.to(dtype = torch.bfloat16)
tensor_fp32_to_bf16[:5]
> tensor([0.6172, 0.6445, 0.0933, 0.3027, 0.5859], dtype=torch.bfloat16)
# tensor_fp32 x tensor_fp32
m_float32 = torch.dot(tensor_fp32, tensor_fp32)
m_float32
> tensor(344.0361)
# tensor_fp32_to_bf16 x tensor_fp32_to_bf16
m_bfloat16 = torch.dot(tensor_fp32_to_bf16, tensor_fp32_to_bf16)
m_bfloat16
> tensor(344., dtype=torch.bfloat16)
ML Models with Different Data Types
다양한 데이터 유형으로 ML 모델을 로드합니다.
import torch
import torch.nn as nn
import requests
from PIL import Image
import warnings
# Ignore specific UserWarnings related to max_length in transformers
warnings.filterwarnings("ignore",
message=".*Using the model-agnostic default `max_length`.*")
class DummyModel(nn.Module):
    """
    Toy network used to demonstrate dtype casting.

    Pipeline: token embedding -> (linear -> layer norm) x 2 -> linear head.
    Every layer has width 2.
    """

    def __init__(self):
        super().__init__()
        # Seed the global RNG so each new instance is initialised from the
        # same random state.
        torch.manual_seed(123)

        self.token_embedding = nn.Embedding(num_embeddings=2, embedding_dim=2)

        # Block 1
        self.linear_1 = nn.Linear(in_features=2, out_features=2)
        self.layernorm_1 = nn.LayerNorm(normalized_shape=2)

        # Block 2
        self.linear_2 = nn.Linear(in_features=2, out_features=2)
        self.layernorm_2 = nn.LayerNorm(normalized_shape=2)

        self.head = nn.Linear(in_features=2, out_features=2)

    def forward(self, x):
        """Embed the token ids in ``x``, run both blocks, and return logits."""
        h = self.token_embedding(x)

        # Each block is a linear layer followed by its layer norm.
        blocks = (
            (self.linear_1, self.layernorm_1),
            (self.linear_2, self.layernorm_2),
        )
        for linear, norm in blocks:
            h = norm(linear(h))

        return self.head(h)
model = DummyModel()
model
DummyModel(
(token_embedding): Embedding(2, 2)
(linear_1): Linear(in_features=2, out_features=2, bias=True)
(layernorm_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
(linear_2): Linear(in_features=2, out_features=2, bias=True)
(layernorm_2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
(head): Linear(in_features=2, out_features=2, bias=True)
)
모델에 있는 매개변수의 데이터 유형을 검사하는 함수를 만듭니다.
def print_param_dtype(model):
    """Print one line per named parameter of ``model`` with its dtype."""
    named_params = model.named_parameters()
    for name, param in named_params:
        print(f"{name} is loaded in {param.dtype}")
print_param_dtype(model)
> token_embedding.weight is loaded in torch.float32
linear_1.weight is loaded in torch.float32
linear_1.bias is loaded in torch.float32
layernorm_1.weight is loaded in torch.float32
layernorm_1.bias is loaded in torch.float32
linear_2.weight is loaded in torch.float32
linear_2.bias is loaded in torch.float32
layernorm_2.weight is loaded in torch.float32
layernorm_2.bias is loaded in torch.float32
head.weight is loaded in torch.float32
head.bias is loaded in torch.float32
Model Casting: float16
float16
Cast the model into a different precision.
# float 16
model_fp16 = DummyModel().half()
매개변수의 데이터 유형을 검사합니다.
print_param_dtype(model_fp16)
> token_embedding.weight is loaded in torch.float16
linear_1.weight is loaded in torch.float16
linear_1.bias is loaded in torch.float16
layernorm_1.weight is loaded in torch.float16
layernorm_1.bias is loaded in torch.float16
linear_2.weight is loaded in torch.float16
linear_2.bias is loaded in torch.float16
layernorm_2.weight is loaded in torch.float16
layernorm_2.bias is loaded in torch.float16
head.weight is loaded in torch.float16
head.bias is loaded in torch.float16
model_fp16
> DummyModel(
(token_embedding): Embedding(2, 2)
(linear_1): Linear(in_features=2, out_features=2, bias=True)
(layernorm_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
(linear_2): Linear(in_features=2, out_features=2, bias=True)
(layernorm_2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
(head): Linear(in_features=2, out_features=2, bias=True)
)
모델을 사용하여 간단한 추론을 실행합니다.
import torch
dummy_input = torch.LongTensor([[1, 0], [0, 1]])
# inference using float32 model
logits_fp32 = model(dummy_input)
logits_fp32
> tensor([[[-0.6872, 0.7132],
[-0.6872, 0.7132]],
[[-0.6872, 0.7132],
[-0.6872, 0.7132]]], grad_fn=<ViewBackward0>)
# inference using float16 model
try:
logits_fp16 = model_fp16(dummy_input)
except Exception as error:
print("\033[91m", type(error).__name__, ": ", error, "\033[0m")
> RuntimeError : "LayerNormKernelImpl" not implemented for 'Half'
Model Casting: bfloat16
bfloat16
from copy import deepcopy
model_bf16 = deepcopy(model)
model_bf16 = model_bf16.to(torch.bfloat16)
print_param_dtype(model_bf16)
logits_bf16 = model_bf16(dummy_input)
> token_embedding.weight is loaded in torch.bfloat16
linear_1.weight is loaded in torch.bfloat16
linear_1.bias is loaded in torch.bfloat16
layernorm_1.weight is loaded in torch.bfloat16
layernorm_1.bias is loaded in torch.bfloat16
linear_2.weight is loaded in torch.bfloat16
linear_2.bias is loaded in torch.bfloat16
layernorm_2.weight is loaded in torch.bfloat16
layernorm_2.bias is loaded in torch.bfloat16
head.weight is loaded in torch.bfloat16
head.bias is loaded in torch.bfloat16
이제 logits_fp32와 logits_bf16의 차이를 비교해 보겠습니다.
# Element-wise absolute difference between the bf16 and fp32 logits,
# computed once and reused for both statistics.
abs_diff = (logits_bf16 - logits_fp32).abs()
mean_diff = abs_diff.mean().item()
max_diff = abs_diff.max().item()
print(f"Mean diff: {mean_diff} | Max diff: {max_diff}")
> Mean diff: 0.000997886061668396 | Max diff: 0.0016907453536987305
LLM Models in Different Data Types
from transformers import BlipForConditionalGeneration
model_name = "Salesforce/blip-image-captioning-base"
model = BlipForConditionalGeneration.from_pretrained(model_name)
# inspect the default data types of the model
# print_param_dtype(model)
모델의 메모리 사용량을 확인합니다.
fp32_mem_footprint = model.get_memory_footprint()
print("Footprint of the fp32 model in bytes: ",
fp32_mem_footprint)
print("Footprint of the fp32 model in MBs: ",
fp32_mem_footprint/1e+6)
> Footprint of the fp32 model in bytes: 989660400
Footprint of the fp32 model in MBs: 989.6604
동일한 모델을 bfloat16으로 로드합니다.
model_bf16 = BlipForConditionalGeneration.from_pretrained(
model_name,
torch_dtype=torch.bfloat16
)
# Memory footprint of the bf16 model, in bytes
bf16_mem_footprint = model_bf16.get_memory_footprint()
# bf16 footprint expressed as a fraction of the fp32 footprint
relative_diff = bf16_mem_footprint / fp32_mem_footprint
print("Footprint of the bf16 model in MBs: ",
      bf16_mem_footprint/1e+6)
print(f"Relative diff: {relative_diff}")
> Footprint of the bf16 model in MBs: 494.832248
Relative diff: 0.5000020693967345
Model Performance: float32 vs bfloat16
이제 두 모델의 생성 결과를 비교해 보겠습니다.
def get_generation(model, processor, image, dtype):
    """Run ``model.generate`` on ``image`` and return the decoded text.

    ``processor`` turns the image into PyTorch tensors, which are cast with
    ``.to(dtype)`` before generation. Only the first generated sequence is
    decoded, with special tokens stripped.
    """
    model_inputs = processor(image, return_tensors="pt")
    model_inputs = model_inputs.to(dtype)
    generated = model.generate(**model_inputs)
    first_sequence = generated[0]
    return processor.decode(first_sequence, skip_special_tokens=True)
def load_image(img_url, timeout=30):
    """Download the image at ``img_url`` and return it as an RGB PIL image.

    Args:
        img_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller forever.

    Returns:
        The downloaded image converted to RGB mode.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    from io import BytesIO

    # The context manager releases the connection even if a check fails.
    with requests.get(img_url, timeout=timeout) as response:
        # Fail on 4xx/5xx instead of handing an error page to PIL.
        response.raise_for_status()
        # ``content`` is the fully downloaded, content-decoded body.
        payload = response.content

    return Image.open(BytesIO(payload)).convert('RGB')
from transformers import BlipProcessor
processor = BlipProcessor.from_pretrained(model_name)
from IPython.display import display
img_url = 'https://storage.googleapis.com/\
sfr-vision-language-research/BLIP/demo.jpg'
image = load_image(img_url)
display(image.resize((500, 350)))
results_fp32 = get_generation(model,
processor,
image,
torch.float32)
print("fp32 Model Results:\n", results_fp32)
> fp32 Model Results:
a woman sitting on the beach with her dog
results_bf16 = get_generation(model_bf16,
processor,
image,
torch.bfloat16)
print("bf16 Model Results:\n", results_bf16)
> bf16 Model Results:
a woman sitting on the beach with a dog
Default Data Type
허깅페이스 트랜스포머 라이브러리의 경우, 모델을 로드할 때의 기본 데이터 유형은 float32입니다. '기본 데이터 유형'은 원하는 것으로 설정할 수 있습니다.
desired_dtype = torch.bfloat16
torch.set_default_dtype(desired_dtype)
dummy_model_bf16 = DummyModel()
print_param_dtype(dummy_model_bf16)
> token_embedding.weight is loaded in torch.bfloat16
linear_1.weight is loaded in torch.bfloat16
linear_1.bias is loaded in torch.bfloat16
layernorm_1.weight is loaded in torch.bfloat16
layernorm_1.bias is loaded in torch.bfloat16
linear_2.weight is loaded in torch.bfloat16
linear_2.bias is loaded in torch.bfloat16
layernorm_2.weight is loaded in torch.bfloat16
layernorm_2.bias is loaded in torch.bfloat16
head.weight is loaded in torch.bfloat16
head.bias is loaded in torch.bfloat16
마찬가지로 기본 데이터 유형을 float32로 재설정할 수 있습니다. 기본 데이터 유형은 새로 생성되는 텐서에만 적용되므로, 이미 만들어진 dummy_model_bf16의 매개변수는 아래 출력처럼 bfloat16으로 유지됩니다.
torch.set_default_dtype(torch.float32)
print_param_dtype(dummy_model_bf16)
> token_embedding.weight is loaded in torch.bfloat16
linear_1.weight is loaded in torch.bfloat16
linear_1.bias is loaded in torch.bfloat16
layernorm_1.weight is loaded in torch.bfloat16
layernorm_1.bias is loaded in torch.bfloat16
linear_2.weight is loaded in torch.bfloat16
linear_2.bias is loaded in torch.bfloat16
layernorm_2.weight is loaded in torch.bfloat16
layernorm_2.bias is loaded in torch.bfloat16
head.weight is loaded in torch.bfloat16
head.bias is loaded in torch.bfloat16
Last updated