Commit 479f9de1 authored by Cao Duc Anh

reupdate

parent 7a3e3b97
# change to the list of characters in your dataset, or use the default Vietnamese characters
vocab: 'aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~ '
# cpu, cuda, cuda:0
device: cuda:0
seq_modeling: transformer
transformer:
d_model: 256
nhead: 8
num_encoder_layers: 6
num_decoder_layers: 6
dim_feedforward: 2048
max_seq_length: 1024
pos_dropout: 0.1
trans_dropout: 0.1
optimizer:
max_lr: 0.0003
pct_start: 0.1
trainer:
batch_size: 32
print_every: 200
valid_every: 4000
iters: 100000
# where to save our model for prediction
export: ./weights/transformerocr.pth
checkpoint: ./checkpoint/transformerocr_checkpoint.pth
log: ./train.log
# null to disable accuracy computation, or set to the number of samples used for validation while training
metrics: null
dataset:
# name of your dataset
name: data
# path to the annotation files and images
data_root: ./img/
train_annotation: annotation_train.txt
valid_annotation: annotation_val_small.txt
# resize images to a height of 32; a larger height will increase accuracy
image_height: 32
image_min_width: 32
image_max_width: 512
dataloader:
num_workers: 3
pin_memory: True
aug:
image_aug: true
masked_language_model: true
predictor:
# enable or disable beam search during prediction; using beam search is slower
beamsearch: False
quiet: False
\ No newline at end of file
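For orientation, the block above is the base OCR configuration (vocab, device, transformer head, trainer, dataset, dataloader, augmentation and predictor settings). A minimal sketch of how such a file is typically consumed for prediction is shown below; it assumes the usual vietocr `Cfg`/`Predictor` API, a local copy of this config saved as `config.yml`, and a hypothetical image path, so treat it as illustrative rather than part of this commit.

# Minimal usage sketch (assumptions: vietocr's Cfg/Predictor API, a local config.yml, a sample image)
from PIL import Image
from vietocr.tool.config import Cfg
from vietocr.tool.predictor import Predictor

config = Cfg.load_config_from_file('config.yml')   # the YAML above
config['device'] = 'cpu'                            # override device when no GPU is available
config['predictor']['beamsearch'] = False           # greedy decoding is faster than beam search

detector = Predictor(config)
print(detector.predict(Image.open('sample_text_line.jpg')))  # hypothetical input image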
pretrain:
id_or_url: 1nTKlEog9YFK74kPyX0qLwCWi60_YHHk4
md5: efcabaa6d3adfca8e52bda2fd7d2ee04
cached: /tmp/tranformerorc.pth
# url or local path
weights: https://drive.google.com/uc?id=1nTKlEog9YFK74kPyX0qLwCWi60_YHHk4
backbone: vgg19_bn
cnn:
# pooling stride size
ss:
- [2, 2]
- [2, 2]
- [2, 1]
- [2, 1]
- [1, 1]
# pooling kernel size
ks:
- [2, 2]
- [2, 2]
- [2, 1]
- [2, 1]
- [1, 1]
# dim of output feature map
hidden: 256
seq_modeling: seq2seq
transformer:
encoder_hidden: 256
decoder_hidden: 256
img_channel: 256
decoder_embedded: 256
dropout: 0.1
optimizer:
max_lr: 0.001
pct_start: 0.1
\ No newline at end of file
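The block above carries only the model-specific overrides for the seq2seq variant (pretrain source, VGG19-BN backbone, CNN pooling sizes, hidden dims, optimizer). In practice these overrides are merged on top of the base config; the sketch below shows one hedged way to do that merge by hand with PyYAML and a recursive dict update, where the file names `base.yml` and `vgg-seq2seq.yml` are assumptions for illustration.

# Hand-rolled merge of a base config with model-specific overrides.
# Assumptions: PyYAML is installed; the files are saved as base.yml and vgg-seq2seq.yml.
import yaml

def deep_update(base, override):
    # Recursively overlay `override` onto `base`, replacing scalars and extending nested dicts.
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update(base[key], value)
        else:
            base[key] = value
    return base

with open('base.yml') as f:
    config = yaml.safe_load(f)
with open('vgg-seq2seq.yml') as f:
    config = deep_update(config, yaml.safe_load(f))

print(config['seq_modeling'], config['optimizer']['max_lr'])  # expected: seq2seq 0.001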
# for training
pretrain: https://vocr.vn/data/vietocr/vgg_transformer.pth
# url or local path (predict)
weights: https://vocr.vn/data/vietocr/vgg_transformer.pth
backbone: vgg19_bn
cnn:
pretrained: True
# pooling stride size
ss:
- [2, 2]
- [2, 2]
- [2, 1]
- [2, 1]
- [1, 1]
# pooling kernel size
ks:
- [2, 2]
- [2, 2]
- [2, 1]
- [2, 1]
- [1, 1]
# dim of output feature map
hidden: 256
seq_modeling: transformer
transformer:
encoder_hidden: 256
decoder_hidden: 256
img_channel: 256
decoder_embedded: 256
dropout: 0.1
optimizer:
max_lr: 0.001
pct_start: 0.1
\ No newline at end of file
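Here `pretrain` and `weights` point at a remote URL, so the checkpoint must be downloaded before it can be loaded. The standard-library sketch below illustrates a simple download-and-cache step; the cache path, the CPU `map_location`, and the decision to skip re-downloading are assumptions, not something this config prescribes.

# Fetch the remote weights once, cache them locally, then load with torch.
# Assumptions: write access to /tmp and a CPU-only load via map_location.
import os
import urllib.request
import torch

WEIGHTS_URL = 'https://vocr.vn/data/vietocr/vgg_transformer.pth'
CACHE_PATH = '/tmp/vgg_transformer.pth'          # assumed cache location

if not os.path.exists(CACHE_PATH):
    urllib.request.urlretrieve(WEIGHTS_URL, CACHE_PATH)

state_dict = torch.load(CACHE_PATH, map_location='cpu')
print(type(state_dict))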
@@ -214,7 +214,7 @@
" im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border\n",
" return im, r, (dw, dh)\n",
"\n",
"names = ['id', 'name', 'birth', 'sex', 'nation', 'origin', 'origin1', 'r1', 'r2']\n",
"names = [\"birth\", \"id\", \"name\", \"nation\", \"origin\", \"origin1\", \"r1\", \"r2\", \"sex\"]\n",
"colors = {name:[random.randint(0, 255) for _ in range(3)] for i,name in enumerate(names)}\n",
"\n",
"img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
@@ -366,9 +366,9 @@
"metadata": {},
"outputs": [],
"source": [
"model_cnn = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/cnn.cpu.torchscript\").to(device)\n",
"model_encoder = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/encoder.cpu.torchscript\").to(device)\n",
"model_decoder = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/decoder.cpu.torchscript\").to(device)"
"model_cnn = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/cnn.torchscript\").to(device)\n",
"model_encoder = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/encoder.torchscript\").to(device)\n",
"model_decoder = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/decoder.torchscript\").to(device)"
]
},
{
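The hunk above only renames the exported TorchScript files from `*.cpu.torchscript` to the device-agnostic `*.torchscript` names. As a hedged illustration of the same idea, the helper below loads a scripted module and places it on whichever device is available; the relative `weight/` paths and the CUDA fallback are assumptions for the sketch, not part of the notebook.

# Load a TorchScript module and move it to an available device.
# Assumptions: the weight/ directory layout implied by the diff above.
import torch

def load_scripted(path, device=None):
    # torch.jit.load the scripted module, auto-detecting the device when none is given.
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    module = torch.jit.load(path, map_location=device)
    return module.to(device).eval()

model_cnn = load_scripted('weight/cnn.torchscript')
model_encoder = load_scripted('weight/encoder.torchscript')
model_decoder = load_scripted('weight/decoder.torchscript')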
{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Errno 2] No such file or directory: 'yolov7'\n",
"/home/anhcd/Projects/dac_vietocr/yolov7\n"
]
}
],
"source": [
"%cd yolov7\n",
"import cv2\n",
"import torch\n",
"import numpy as np\n",
"import os\n",
"\n",
"from models.experimental import attempt_load\n",
"from utils.datasets import letterbox\n",
"from utils.general import non_max_suppression, scale_coords\n",
"from utils.plots import plot_one_box\n",
"\n",
"device = torch.device(\"cpu\")\n",
"half = device.type != \"cpu\"\n",
"\n",
"CATEGORIES = [\"bottom_left\",\"bottom_right\", \"top_left\", \"top_right\"]\n",
"COLORS = [(66, 135, 245), (194, 66, 245), (250, 52, 72), (111, 250, 52)]\n",
"\n",
"\n",
"def load_model(path):\n",
" model = attempt_load(path, map_location=device)\n",
"\n",
" if half:\n",
" model.half() \n",
" #model = torch.load(path, map_location=torch.device('cpu'))\n",
"\n",
" return model\n",
"\n",
"\n",
"def convert_img(img, device, half, new_size=416):\n",
" img = letterbox(img, new_shape=new_size)[0]\n",
" img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416\n",
" img = np.ascontiguousarray(img)\n",
" img = torch.from_numpy(img).to(device)\n",
"\n",
" img = img.half() if half else img.float()\n",
" img = img / 255.0\n",
"\n",
" if img.ndimension() == 3:\n",
" img = img.unsqueeze(0)\n",
"\n",
" return img\n",
"\n",
"\n",
"def predict_4_corners(img, model, resized_width=640):\n",
" if resized_width is not None:\n",
" orig_img = cv2.resize(img, width=640)\n",
" else:\n",
" orig_img = cv2.imread(img).copy()\n",
"\n",
" plot_img = orig_img.copy()\n",
" img = convert_img(orig_img, device, half, new_size=640)\n",
" _, _, new_height, new_width = img.size()\n",
"\n",
" preds = model(img)[0]\n",
" preds = non_max_suppression(preds, 0.2, 0.2)\n",
" result = np.array([], dtype=np.float32)\n",
"\n",
" # based on YOLOv7\n",
" for i, det in enumerate(preds): # detections per image\n",
" if det is not None:\n",
" # Rescale boxes from img_size to im0 sizes\n",
" det[:, :4] = scale_coords(img.size()[2:], det[:, :4], orig_img.shape).round()\n",
" result = det.type(torch.float32).cpu().detach().numpy()\n",
"\n",
" # for visualization only\n",
" for *xyxy, conf, cls in reversed(det):\n",
" label = \"%s %.2f\" % (CATEGORIES[int(cls)], conf)\n",
" plot_one_box(xyxy, plot_img, label=label, color=COLORS[int(cls)], line_thickness=2)\n",
"\n",
" return result, orig_img, plot_img\n",
"\n",
"\n",
"def filter_redundancy(result):\n",
" deleted_indexes = []\n",
"\n",
" for i, element in enumerate(result):\n",
" category = element[5]\n",
" max_class_confidence_score = max(result[np.where(result[:, 5] == category)][:, 4])\n",
"\n",
" if element[4] < max_class_confidence_score:\n",
" deleted_indexes.append(i)\n",
"\n",
" return np.delete(result, deleted_indexes, axis=0)\n",
"\n",
"\n",
"def get_center_point(coordinate_dict):\n",
" di = dict()\n",
"\n",
" for key in coordinate_dict.keys():\n",
" xmin, ymin, xmax, ymax = coordinate_dict[key]\n",
" x_center = (xmin + xmax) / 2\n",
" y_center = (ymin + ymax) / 2\n",
" di[key] = (x_center, y_center)\n",
"\n",
" return di\n",
"\n",
"def get_point(coordinate_dict):\n",
" di = dict()\n",
"\n",
" for key in coordinate_dict.keys():\n",
" xmin, ymin, xmax, ymax = coordinate_dict[key]\n",
" if key == 'bottom_left':\n",
" di[key] = (xmin, ymin + 50)\n",
" elif key == 'bottom_right':\n",
" di[key] = (xmax, ymin + 50)\n",
" elif key == 'top_left':\n",
" di[key] = (xmin, ymax - 25)\n",
" elif key == 'top_right':\n",
" di[key] = (xmax, ymax - 25)\n",
"\n",
" return di\n",
"\n",
"\n",
"def dictionary(result):\n",
" diction = {}\n",
" for det in result:\n",
" if int(det[5]) == 0:\n",
" a = 'bottom_left'\n",
" elif int(det[5]) == 1:\n",
" a = 'bottom_right'\n",
" elif int(det[5]) == 2:\n",
" a = 'top_left'\n",
" elif int(det[5]) == 3:\n",
" a = 'top_right' \n",
" label = [\n",
" det[0], \n",
" det[1], \n",
" det[2], \n",
" det[3], \n",
" ]\n",
" diction[a] = label\n",
"\n",
" return diction\n",
"\n",
"def find_miss_corner(coordinate_dict):\n",
" position_name = ['bottom_left', 'bottom_right','top_left', 'top_right']\n",
" position_index = np.array([0, 0, 0, 0])\n",
"\n",
" for name in coordinate_dict.keys():\n",
" if name in position_name:\n",
" position_index[position_name.index(name)] = 1\n",
"\n",
" index = np.argmin(position_index)\n",
"\n",
" return index\n",
"\n",
"\n",
"def calculate_missed_coord_corner(coordinate_dict):\n",
" thresh = 0\n",
"\n",
" index = find_miss_corner(coordinate_dict)\n",
"\n",
" # calculate missed corner coordinate\n",
" # case 1: missed corner is \"top_left\"\n",
" \n",
" if index == 0: # \"bottom_left\"\n",
" midpoint = np.add(coordinate_dict['top_left'], coordinate_dict['bottom_right']) / 2\n",
" y = 2 * midpoint[1] - coordinate_dict['top_right'][1] - thresh\n",
" x = 2 * midpoint[0] - coordinate_dict['top_right'][0] - thresh\n",
" coordinate_dict['bottom_left'] = (x, y)\n",
" elif index == 1: # \"bottom_right\"\n",
" midpoint = np.add(coordinate_dict['bottom_left'], coordinate_dict['top_right']) / 2\n",
" y = 2 * midpoint[1] - coordinate_dict['top_left'][1] - thresh\n",
" x = 2 * midpoint[0] - coordinate_dict['top_left'][0] - thresh\n",
" coordinate_dict['bottom_right'] = (x, y)\n",
" elif index == 2:\n",
" midpoint = np.add(coordinate_dict['top_right'], coordinate_dict['bottom_left']) / 2\n",
" y = 2 * midpoint[1] - coordinate_dict['bottom_right'][1] - thresh\n",
" x = 2 * midpoint[0] - coordinate_dict['bottom_right'][0] - thresh\n",
" coordinate_dict['top_left'] = (x, y)\n",
" elif index == 3: # \"top_right\"\n",
" midpoint = np.add(coordinate_dict['top_left'], coordinate_dict['bottom_right']) / 2\n",
" y = 2 * midpoint[1] - coordinate_dict['bottom_left'][1] - thresh\n",
" x = 2 * midpoint[0] - coordinate_dict['bottom_left'][0] - thresh\n",
" coordinate_dict['top_right'] = (x, y)\n",
" return coordinate_dict\n",
"\n",
"\n",
"def perspective_transform(image, source_points):\n",
" dest_points = np.float32([[0, 0], [500, 0], [500, 300], [0, 300]])\n",
" M = cv2.getPerspectiveTransform(source_points, dest_points)\n",
" dst = cv2.warpPerspective(image, M, (500, 300))\n",
"\n",
" return dst\n",
"\n",
"\n",
"\n",
"def warp_identity_card(img, model):\n",
" result, orig_img, plot_img = predict_4_corners(img, model, None)\n",
" if len(result) > 4:\n",
" result = filter_redundancy(result)\n",
"\n",
" result = dictionary(result) \n",
" result = get_point(result) \n",
"\n",
" if len(result) < 3:\n",
" print('Please try again')\n",
" return plot_img\n",
" \n",
" if len(result) == 3:\n",
" result = calculate_missed_coord_corner(result) \n",
"\n",
" bottom_right_point = result['bottom_right']\n",
" bottom_left_point = result['bottom_left']\n",
" top_left_point = result['top_left']\n",
" top_right_point = result['top_right']\n",
"\n",
" source_points = np.float32([top_left_point,top_right_point,bottom_right_point,bottom_left_point])\n",
" #img = cv2.imread(orig_img)\n",
" crop = perspective_transform(orig_img, source_points)\n",
"\n",
" return crop\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Xử lý xoay ảnh "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fusing layers... \n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"IDetect.fuse\n",
"Please try again\n",
"Please try again\n",
"Please try again\n",
"Please try again\n",
"Please try again\n",
"Please try again\n",
"Please try again\n"
]
}
],
"source": [
"import glob\n",
"import os\n",
"import cv2\n",
"import numpy as np\n",
"\n",
"import os\n",
"img_paths = []\n",
"dir = \"/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao\"\n",
"for file in os.listdir(dir):\n",
" img_paths.append(os.path.join(dir, file))\n",
"\n",
"model = load_model(\"/home/anhcd/Projects/dac_vietocr/model/cccd_4_conner.pt\")\n",
"\n",
"for img_path in img_paths:\n",
" img_name = img_path.split(\"/\")[-1].split(\".\")[0]\n",
" \n",
" crop = warp_identity_card(img_path, model)\n",
" \n",
" cv2.imshow(f\"{img_name}\", crop) \n",
" cv2.waitKey(1000)\n",
" # cv2.imwrite(\"/home/anhcd/Projects/IEcccd/img/\" + img_name+ \".jpg\", crop)\n",
"\n",
"cv2.destroyAllWindows()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Xử lý các thông tin bên trong"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"categories_text = ['id', 'name', 'birth', 'sex', 'nation', 'origin', 'origin1', 'r1', 'r2']\n",
"\n",
"def predict_text(img, model, resized_width=640):\n",
" if resized_width is not None:\n",
" orig_img = cv2.resize(img, width=640)\n",
" else:\n",
" orig_img = cv2.imread(img).copy()\n",
"\n",
" plot_img = orig_img.copy()\n",
" img = convert_img(orig_img, device, half, new_size=640)\n",
" _, _, new_height, new_width = img.size()\n",
"\n",
" preds = model(img)[0]\n",
" print(preds)\n",
" preds = non_max_suppression(preds, 0.2, 0.2)\n",
" result = np.array([], dtype=np.float32)\n",
"\n",
" # based on YOLOv7\n",
" for i, det in enumerate(preds): # detections per image\n",
" if det is not None:\n",
" # Rescale boxes from img_size to im0 sizes\n",
" det[:, :4] = scale_coords(img.size()[2:], det[:, :4], orig_img.shape).round()\n",
" result = det.type(torch.float32).cpu().detach().numpy()\n",
"\n",
" # for visualization only\n",
" #for *xyxy, conf, cls in reversed(det):\n",
" # label = '%s %.2f' % (CATEGORIES[int(cls)], conf)\n",
" #plot_one_box(xyxy, plot_img, label=label, color=COLORS[int(cls)], line_thickness=2) '''\n",
"\n",
" return result, orig_img, plot_img \n",
"\n",
"def dictionary_text(result):\n",
" diction = {}\n",
" for det in result:\n",
" if int(det[5]) == 0:\n",
" a = 'birth'\n",
" elif int(det[5]) == 1:\n",
" a = 'id'\n",
" elif int(det[5]) == 2:\n",
" a = 'name'\n",
" elif int(det[5]) == 3:\n",
" a = 'nation' \n",
" elif int(det[5]) == 4:\n",
" a = 'origin' \n",
" elif int(det[5]) == 5:\n",
" a = 'origin1' \n",
" elif int(det[5]) == 6:\n",
" a = 'r1' \n",
" elif int(det[5]) == 7:\n",
" a = 'r2' \n",
" elif int(det[5]) == 8:\n",
" a = 'sex' \n",
" label = [\n",
" det[0], \n",
" det[1],\n",
" det[2], \n",
" det[3] \n",
" ]\n",
" diction[a] = label\n",
"\n",
" return diction\n",
"\n",
"def perspective_transform_text(image, source_points):\n",
" dest_points = np.float32([[0, 0], [800, 0], [800, 100], [0, 100]])\n",
" M = cv2.getPerspectiveTransform(source_points, dest_points)\n",
" dst = cv2.warpPerspective(image, M, (800, 100))\n",
"\n",
" return dst"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fusing layers... \n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"IDetect.fuse\n",
"img_paths: ['/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1nation.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1birth.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1name.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1sex.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1origin.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1r2.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1id.jpg']\n",
"img_name: ducanhcaocccd1nation\n",
"tensor([[[7.26099e+00, 4.05999e+00, 2.87396e+01, ..., 1.99240e-02, 2.64919e-02, 2.19129e-01],\n",
" [1.37559e+01, 5.06019e+00, 3.11044e+01, ..., 3.57850e-02, 2.77619e-02, 1.63850e-01],\n",
" [2.29905e+01, 4.46362e+00, 4.87838e+01, ..., 5.33510e-02, 2.07194e-02, 7.84407e-02],\n",
" ...,\n",
" [5.50323e+02, 7.48115e+01, 1.05305e+02, ..., 3.95563e-02, 4.42920e-02, 1.24317e-01],\n",
" [5.80126e+02, 7.41523e+01, 1.00826e+02, ..., 4.02013e-02, 5.62249e-02, 9.85802e-02],\n",
" [6.13404e+02, 7.78276e+01, 1.52621e+02, ..., 4.46064e-02, 4.85629e-02, 1.02114e-01]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.09395909309387207 seconds\n",
"img_name: ducanhcaocccd1birth\n",
"tensor([[[6.96512e+00, 4.01770e+00, 3.03306e+01, ..., 2.01924e-02, 2.56553e-02, 1.28210e-01],\n",
" [1.41638e+01, 5.12023e+00, 3.38066e+01, ..., 4.23885e-02, 3.26655e-02, 9.99929e-02],\n",
" [2.26646e+01, 4.49237e+00, 4.83757e+01, ..., 7.18630e-02, 2.23456e-02, 8.54115e-02],\n",
" ...,\n",
" [5.46470e+02, 7.53209e+01, 1.09950e+02, ..., 3.55830e-02, 3.50057e-02, 1.43581e-01],\n",
" [5.77843e+02, 7.36600e+01, 1.26061e+02, ..., 3.77752e-02, 4.51986e-02, 1.06225e-01],\n",
" [6.11994e+02, 7.75922e+01, 1.71467e+02, ..., 3.88749e-02, 4.10803e-02, 1.09043e-01]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.07357263565063477 seconds\n",
"img_name: ducanhcaocccd1name\n",
"tensor([[[7.85997e+00, 4.04146e+00, 3.11653e+01, ..., 2.21473e-02, 2.85321e-02, 1.30392e-01],\n",
" [1.46489e+01, 5.06272e+00, 3.40587e+01, ..., 5.33185e-02, 3.22187e-02, 1.10082e-01],\n",
" [2.28639e+01, 4.56742e+00, 4.99890e+01, ..., 9.17780e-02, 2.37300e-02, 7.56081e-02],\n",
" ...,\n",
" [5.50199e+02, 7.26495e+01, 1.09896e+02, ..., 3.20059e-02, 5.48642e-02, 1.52614e-01],\n",
" [5.79119e+02, 7.11925e+01, 1.06547e+02, ..., 3.43380e-02, 6.31083e-02, 1.27570e-01],\n",
" [6.13034e+02, 7.54183e+01, 1.65852e+02, ..., 4.05585e-02, 5.14021e-02, 1.19271e-01]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.0675969123840332 seconds\n",
"img_name: ducanhcaocccd1sex\n",
"tensor([[[6.75139e+00, 3.79737e+00, 3.04912e+01, ..., 1.90046e-02, 2.42044e-02, 1.88708e-01],\n",
" [1.37134e+01, 5.20134e+00, 3.29218e+01, ..., 3.09879e-02, 2.90582e-02, 1.28494e-01],\n",
" [2.22934e+01, 4.87867e+00, 4.81174e+01, ..., 4.31133e-02, 2.05402e-02, 7.62491e-02],\n",
" ...,\n",
" [5.51570e+02, 7.44274e+01, 9.48155e+01, ..., 3.55333e-02, 4.69442e-02, 1.29273e-01],\n",
" [5.80613e+02, 7.39035e+01, 8.86110e+01, ..., 4.03416e-02, 5.58332e-02, 9.57269e-02],\n",
" [6.13693e+02, 7.79142e+01, 1.48807e+02, ..., 4.39701e-02, 4.92099e-02, 9.85647e-02]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.06969332695007324 seconds\n",
"img_name: ducanhcaocccd1origin\n",
"tensor([[[6.30326e+00, 3.94146e+00, 3.29426e+01, ..., 3.67895e-02, 3.45050e-02, 1.36935e-01],\n",
" [1.30867e+01, 5.66837e+00, 3.39089e+01, ..., 6.47475e-02, 4.40949e-02, 1.11277e-01],\n",
" [2.04908e+01, 5.58759e+00, 4.72886e+01, ..., 8.97226e-02, 2.74214e-02, 7.83768e-02],\n",
" ...,\n",
" [5.54586e+02, 7.71124e+01, 1.47577e+02, ..., 4.88534e-02, 4.75350e-02, 1.28162e-01],\n",
" [5.82739e+02, 7.57000e+01, 1.32784e+02, ..., 4.18695e-02, 6.51118e-02, 1.01845e-01],\n",
" [6.13369e+02, 7.69265e+01, 1.95680e+02, ..., 4.00506e-02, 6.35686e-02, 9.50838e-02]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.07462000846862793 seconds\n",
"img_name: ducanhcaocccd1r2\n",
"tensor([[[6.54024e+00, 3.99407e+00, 3.07478e+01, ..., 2.84999e-02, 3.44758e-02, 1.18740e-01],\n",
" [1.38926e+01, 4.84244e+00, 3.47917e+01, ..., 6.59324e-02, 3.85384e-02, 1.24240e-01],\n",
" [2.16096e+01, 4.78552e+00, 4.92324e+01, ..., 1.01309e-01, 2.57947e-02, 1.20259e-01],\n",
" ...,\n",
" [5.53538e+02, 7.74819e+01, 1.41711e+02, ..., 4.88862e-02, 3.97473e-02, 1.29176e-01],\n",
" [5.83266e+02, 7.85188e+01, 1.41294e+02, ..., 4.43791e-02, 5.97726e-02, 9.62701e-02],\n",
" [6.13904e+02, 8.01113e+01, 1.99567e+02, ..., 4.24263e-02, 6.37722e-02, 9.40810e-02]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.0670614242553711 seconds\n",
"img_name: ducanhcaocccd1id\n",
"tensor([[[7.15865e+00, 3.90274e+00, 3.01145e+01, ..., 1.87799e-02, 2.34303e-02, 1.72056e-01],\n",
" [1.43305e+01, 4.87431e+00, 3.33370e+01, ..., 3.62554e-02, 2.64340e-02, 1.28734e-01],\n",
" [2.30727e+01, 4.47840e+00, 4.85699e+01, ..., 5.64415e-02, 1.59234e-02, 9.82331e-02],\n",
" ...,\n",
" [5.54129e+02, 7.76490e+01, 9.91483e+01, ..., 4.22094e-02, 4.26301e-02, 1.37826e-01],\n",
" [5.81268e+02, 7.57913e+01, 9.21433e+01, ..., 3.96665e-02, 5.87279e-02, 1.06169e-01],\n",
" [6.14154e+02, 8.00202e+01, 1.56239e+02, ..., 4.18954e-02, 6.09098e-02, 1.01468e-01]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.07022809982299805 seconds\n"
]
}
],
"source": [
"import time \n",
"\n",
"model_text = load_model(\"/home/anhcd/Projects/dac_vietocr/model/crop_text.pt\")\n",
"dir_ocr = '/home/anhcd/Projects/dac_vietocr/img_ocr'\n",
"print(f'img_paths: {img_paths}')\n",
"for img_path in img_paths:\n",
" img_name = img_path.split(\"/\")[-1].split(\".\")[0]\n",
" print(f'img_name: {img_name}')\n",
"\n",
" start_time = time.time()\n",
" result, orig_img, plot_img = predict_text(img_path, model_text, None)\n",
" end_time = time.time()\n",
" # Print the elapsed time\n",
" print(\"Elapsed time:\", end_time - start_time, \"seconds\")\n",
"\n",
" # print(f'result: {result}')\n",
" result = dictionary_text(result)\n",
"\n",
" for key in result.keys():\n",
" xmin, ymin, xmax, ymax = result[key]\n",
"\n",
" bottom_right_point = (xmax, ymin)\n",
" bottom_left_point = (xmin, ymin)\n",
" top_left_point = (xmin, ymax)\n",
" top_right_point = (xmax, ymax)\n",
"\n",
" source_points = np.float32([bottom_left_point,bottom_right_point, top_right_point,top_left_point]) \n",
" crop = perspective_transform_text(orig_img, source_points)\n",
" cv2.imwrite(f\"{dir_ocr}/\" + img_name + key + \".jpg\", crop)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Main flow"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fusing layers... \n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"IDetect.fuse\n",
"Fusing layers... \n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"IDetect.fuse\n",
"Please try again\n",
"result yolo7 []\n",
"result yolo7 []\n",
"result yolo7 []\n",
"result yolo7 []\n",
"result yolo7 []\n",
"result yolo7 []\n"
]
}
],
"source": [
"import glob\n",
"import os\n",
"import cv2\n",
"import numpy as np\n",
"import os\n",
"\n",
"if __name__ == \"__main__\":\n",
" model = load_model(\"/home/anhcd/Projects/dac_ocr/model/cccd_4_conner.pt\")\n",
" model_text = load_model(\"/home/anhcd/Projects/dac_ocr/model/crop_text.pt\")\n",
"\n",
" img_paths = []\n",
" dir = \"/home/anhcd/Projects/dac_ocr/img/\"\n",
" dir_crop = \"/home/anhcd/Projects/dac_ocr/img_crop/\"\n",
" dir_ocr = \"/home/anhcd/Projects/dac_ocr/img_ocr/\"\n",
"\n",
" for file in os.listdir(dir):\n",
" img_paths.append(os.path.join(dir, file))\n",
"\n",
" for img_path in img_paths:\n",
" img_name = img_path.split(\"/\")[-1].split(\".\")[0]\n",
" \n",
" crop = warp_identity_card(img_path, model)\n",
" cv2.imwrite(dir_crop + img_name + \".jpg\", crop)\n",
"\n",
" imgcrop_paths = []\n",
" \n",
" for file in os.listdir(dir_crop):\n",
" imgcrop_paths.append(os.path.join(dir_crop, file))\n",
"\n",
" for img_path in imgcrop_paths:\n",
" img_name = img_path.split(\"/\")[-1].split(\".\")[0]\n",
" result, orig_img, plot_img = predict_text(img_path, model_text, None)\n",
" result = dictionary_text(result)\n",
" if result == {}:\n",
" continue\n",
"\n",
" for key in result.keys():\n",
" xmin, ymin, xmax, ymax = result[key]\n",
"\n",
" bottom_right_point = (xmax, ymin)\n",
" bottom_left_point = (xmin, ymin)\n",
" top_left_point = (xmin, ymax)\n",
" top_right_point = (xmax, ymax)\n",
"\n",
" source_points = np.float32([bottom_left_point,bottom_right_point, top_right_point,top_left_point]) \n",
" crop = perspective_transform_text(orig_img, source_points)\n",
" cv2.imwrite(f\"{dir_ocr}/\" + img_name + key + \".jpg\", crop)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "dacocr",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}