dreambooth training guide

Using dreambooth A111

Download all anime Images for dataset | api.rule, dan, gel - [Script]

Download the software: https://github.com/Bionus/imgbrd-grabber, then open Tools > Options > Save > Separate log files and add a separate log file:
Name: %md5%.%ext% - Suffix: .txt - Text file content: %all:,excludenamespace=general,unsafe,separator=^, %

Search: ~tag1 -tag2 (Add~ remove- or just \<space>) [api.rule, dan, gel]

Fix1 - [Script]

Once the download has finished, put this batch file next to the download folder (here "anime") and run it to fix the extra extension of the caption files and move the videos to their own folder (the same can be done with images).

1
2
3
4
@echo off&pushd "%CD%\anime"&setlocal EnableDelayedExpansion
rem Fix1: tidy the "anime" folder that sits next to this batch file.
rem The line above enters that folder and enables delayed expansion for the loops below.
rem Step 1: every caption file listed as name.jpg.txt, name.png.txt, name.mp4.txt, name.gif.txt,
rem name.webm.txt or name.swf.txt is renamed by removing the media extension text from its name,
rem so the caption ends up as name.txt.
for /f "delims=" %%F in ('dir /b *.jpg.txt *.png.txt *.mp4.txt *.gif.txt *.webm.txt *.swf.txt') do (set "file=%%~nF"&set "file=!file:.jpg=!"&set "file=!file:.png=!"&set "file=!file:.mp4=!"&set "file=!file:.gif=!"&set "file=!file:.webm=!"&set "file=!file:.swf=!"&move "%%F" "!file!%%~xF")
rem Step 2: create the "video" sub-folder when it is missing, then move every mp4, webm, swf and
rem gif file into it together with the caption file of the same name.
rem NOTE(review): the move loop is chained onto the "if not exist" command with an ampersand;
rem confirm that it still runs when the video folder already exists.
if not exist "%CD%\video\" mkdir "%CD%\video\"&for /f "delims=" %%F in ('dir /b *.mp4 *.webm *.swf *.gif') do (set "file=%%~nF"&set "extension=%%~xF"&move "%%F" "video\!file!!extension!"&move "!file!.txt" "video")
rem Return to the starting folder and keep the window open so the output can be read.
popd&pause

Fix2 - [Script]

This will remove all .txt from folder anime512x that does not have a corresponding/secondary file (color correction)

@echo off&set delFiles=0&pushd "%CD%\anime512x"&setlocal EnableDelayedExpansion&for %%a in (*.txt) do (set count=0&for %%b in ("%%~na.*") do (set /a count+=1&echo �[90m!count! %%b�[0m))&if !count!==1 echo %%a �[1mDelete�[0m&set /a delFiles+=1&del /f /q "%%a"
rem Fix2: the line above enters the "anime512x" folder and, for every .txt file, counts the
rem files that share its base name, echoing each match. A count of 1 means that only the .txt
rem itself exists; that case echoes Delete, increments delFiles and deletes the file.
rem NOTE(review): the parentheses of the outer loop close before the "if" test, so the test
rem appears to run once after the loop instead of once per .txt file. Confirm before relying on it.
rem NOTE(review): the odd characters in front of the bracket codes look like escape characters
rem damaged by a copy and paste; presumably they were meant to colour the console output.
echo    �[1m%delFiles%�[0m Files deleted&popd&pause

Fix3: if you add more CLIP captions, remove the duplicated words. - [Script]

This will clean up every .txt file in the folder anime512x and remove duplicated comma-separated words (limited to 1024 characters).

@echo off&pushd "%CD%\anime512x"&setlocal enableextensions enabledelayedexpansion
rem Fix3: for every .txt file in "anime512x" the loop below reads the first line of the file,
rem normalises it with the stringClean routine, splits it on commas and collects the words
rem into uniqueStr, then writes uniqueStr back over the file without a trailing newline.
rem NOTE(review): the comparison against arrayString on the fourth line of the loop sits after
rem the closing parenthesis of the inner counting loop, and the DuplicatedString test sits after
rem the closing parenthesis of the word loop. Confirm that the loop variables are still valid there.
for %%f in (*.txt) do ( set /p line=<%%f&CALL :stringClean line &echo line: !line! &echo -----------  &Set numCounter=0&set uniqueStr=
    for %%b in ("!line:,=" "!") do ( set /A numCounter+=1 &Set "arrayString[!numCounter!]=%%~b" &Set DuplicatedString=false&Set StringsCounter=0
        for /L %%z in (1,1,!numCounter!) do ( echo CURRENTstr!numCounter!: "%%~b" AND   "!arrayString[%%z]!" SO StringsCount: !StringsCounter! uniqueStr: !uniqueStr!
        )&if "!arrayString[%%z]!"=="%%~b" set /A StringsCounter+=1 &if !StringsCounter! GEQ 2 (Set DuplicatedString=true)
    )&IF !DuplicatedString!==false (Set uniqueStr=!uniqueStr!,%%~b) ELSE (echo string!numCounter!DuplicatedTrue: %%~b *!StringsCounter!)
CALL :stringClean uniqueStr &echo %%f FinalString:!uniqueStr! &echo|set /p="!uniqueStr!">"%%f"
)
popd&pause&EXIT /B
rem Subroutine stringClean: takes the NAME of a variable as its first argument and rewrites
rem that variable in place.
:stringClean
rem Collapse space-comma, double-comma and comma-space-comma into a single comma, and runs of
rem four, three and two spaces into one space.
set "%~1=!%~1: ,=,!"&set "%~1=!%~1:,,=,!"&set "%~1=!%~1:, ,=,!"&set "%~1=!%~1:    = !"&set "%~1=!%~1:   = !"&set "%~1=!%~1:  = !"
rem Trim one leading comma, then a trailing space, a trailing comma, another trailing space
rem and finally one leading space.
rem NOTE(review): the three trailing-trim lines contain a percent sign just before the closing
rem exclamation mark; it looks like a typo, so confirm that the substring expansion still works.
if "!%~1:~0,1!" == "," (set "%~1=!%~1:~1!")
if "!%~1:~-1!" == " " (set "%~1=!%~1:~0,-1%!")
if "!%~1:~-1!" == "," (set "%~1=!%~1:~0,-1%!")
if "!%~1:~-1!" == " " (set "%~1=!%~1:~0,-1%!")
if "!%~1:~0,1!" == " " (set "%~1=!%~1:~1!")
EXIT /B 0

Next?

in webui download extension sd_dreambooth_extension+sd_smartprocess+training-picker then apply and restart.

Change Setting training-picker to video path or move videos, reload if necessary
then in Tab Training Picker extract keyframes of each video and remove any unwanted Images.

Use Tab Smart Preprocess (can zoom on image) or Train>Preprocess image to crop and Generate Captions .txt. Example: set path to directory &

  • Crop Images + Generate Captions + Add DeepDanbooru Tags to Caption

or manually crop with Training Picker or birme.net or Edit in Photo>save this frame+crop.
I reduce duplicates with Visual Similarity Duplicate Image Finder: File > New Project (select the directory > OK) > Start Scan (modify the parameters if needed) and wait until it finishes > Autocheck & Delete > Autocheck image with: Smaller file size.. (adjust the checkboxes for the images to remove) > Delete Checked Files > Perform.

Counter tags to text - [Script]

(Optional) Create allTags.txt, which counts and orders all tags from every .txt inside folderName. Usage: py CombineAllTags.py folderName | CombineAllTags.py:

import os
from collections import defaultdict
def combine_all_tags(folder):
    """Count every comma-separated tag in the ``.txt`` files of *folder*.

    The result is written to ``allTags.txt`` in the current working directory,
    one ``tag,count`` line per tag, most frequent tag first.

    Args:
        folder: Directory whose ``.txt`` files are scanned (not recursive).
    """
    tag_count = defaultdict(int)
    output_path = os.path.abspath('allTags.txt')
    for file_name in os.listdir(folder):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(folder, file_name)
        # When the script is run from inside *folder*, a previous allTags.txt
        # would otherwise be parsed as if it were a caption file.
        if os.path.abspath(file_path) == output_path:
            continue
        # Undecodable bytes are replaced instead of aborting the whole count.
        with open(file_path, encoding='utf-8', errors='replace') as f:
            tags = f.read().split(',')
        for tag in tags:
            tag = tag.strip()
            # An empty file, a trailing comma or ",," yields empty strings;
            # they are not tags and must not be counted.
            if tag:
                tag_count[tag] += 1
    with open('allTags.txt', 'w', encoding='utf-8') as f:
        for tag, count in sorted(tag_count.items(), key=lambda x: x[1], reverse=True):
            f.write(f'{tag},{count}\n')
if __name__ == '__main__':
    # Command-line entry point: exactly one argument, the folder to scan.
    import sys
    if len(sys.argv) == 2:
        folder = sys.argv[1]
    else:
        print('Usage: py CombineAllTags.py <folderName>')
        sys.exit(1)
    combine_all_tags(folder)

(Optional) convert any metadata "parameters" > .txt and vice versa. - [Script]

import sys, os, time; from PIL import Image, PngImagePlugin
def modify_parameters(file_path):
    """Move the "parameters" metadata between an image and its sidecar text file.

    The sidecar has the same path as *file_path* with the extension replaced
    by ``.txt``.

    * If the sidecar does not exist, the image's ``parameters`` text entry is
      written to it.
    * If the sidecar exists, its content is stored in the image as the
      ``parameters`` text entry, the image is saved in place and the sidecar
      is deleted.

    Args:
        file_path: Path of the image file.

    Raises:
        KeyError: The image has no ``parameters`` entry (export direction).
    """
    txt_file_path = os.path.splitext(file_path)[0] + '.txt'
    if not os.path.exists(txt_file_path):
        # Read the metadata BEFORE creating the sidecar. Opening the .txt first
        # left an empty file behind when the entry was missing, and the next
        # run would then embed that empty text and delete the sidecar.
        with Image.open(file_path) as img:
            parameters = img.text['parameters']
        with open(txt_file_path, 'w', encoding='utf-8') as f:
            f.write(parameters)
        print(f"{txt_file_path} "+"\033[92m"+"created"+"\033[0m")
    else:
        with open(txt_file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        png_txt = PngImagePlugin.PngInfo()
        png_txt.add_text("parameters", content)
        with Image.open(file_path) as img:
            # Pull the pixel data into memory before the same path is rewritten.
            img.load()
            img.save(file_path, pnginfo=png_txt)
        print(f"{txt_file_path} read and {file_path} modified")
        os.remove(txt_file_path)
        print(f"{txt_file_path} "+"\033[91m"+"Deleted"+"\033[0m")
if __name__ == "__main__":
    # Command-line entry point: exactly one argument, the image file to process.
    if len(sys.argv) != 2:
        usage = (
            'Usage: python ModifyParameters.py <nameOfTheFileAndExtension>\n'
            'Note: This script read&parse the metadata "parameters" from the argument file '
            'to the <fileName>.txt and vice versa. If does exist &delete the txt'
        )
        print(usage)
        sys.exit(1)
    modify_parameters(sys.argv[1])

-

Texture Training

Download all texture and add caption to image, quickly with script

Scrap Texture, keeping dataset | polyhaven - [Script]

If you have Python installed, just run the script: python polyhavenScraper.py:

import os, requests
from tqdm import tqdm

# Download the 2k diffuse PNG of every texture asset listed by the Poly Haven API
# into the "polyhaven_textures" folder.
api_url, out_dir = "https://api.polyhaven.com/assets", "polyhaven_textures"

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops early instead of trying to parse an error page as JSON.
response = requests.get(api_url, timeout=30)
response.raise_for_status()
data = response.json()
os.makedirs(out_dir, exist_ok=True)

for asset in tqdm(data, desc='DL assets', unit='asset', mininterval=0.1, dynamic_ncols=True):
    # Asset types 0, 1, 2: HDRI, Texture, 3D Models. Only textures are downloaded.
    # NOTE(review): the original comment says the first {asset} in the URL is only the
    # right format "for 2/3" - confirm the URL layout against the Poly Haven file API.
    if data[asset]['type'] != 1:
        continue
    url = f"https://dl.polyhaven.org/file/ph-assets/Textures/png/2k/{asset}/{asset}_diff_2k.png"
    with requests.get(url, stream=True, timeout=30) as r:
        if r.status_code != 200:
            print(f"Error downloading {url}, skipping.")
            continue
        total_size = int(r.headers.get('content-length', 0))
        target = os.path.join(out_dir, f"{asset}_diff_2k.png")
        with tqdm(total=total_size, unit='iB', unit_scale=True, desc=url, leave=False) as pb, \
                open(target, 'wb') as f:
            for chunk in r.iter_content(1024):
                if chunk:  # skip empty chunks
                    f.write(chunk)
                    pb.update(len(chunk))

Add Text CLIP Caption to each Images - [Extension]

Install the clip-interrogator-ext extension for AUTOMATIC1111; it lets you use more models for captioning.
Interrogator>Batch>Prompt Mode>Best, Enter correct Images folder>.\path\to\polyhaven_textures, and Choose a CLIP Model>Go!
This will add a text file inside polyhaven_textures describing each image.

Best CLIP model I found for texture in order

  • xlm roberta large ViT-H-14 frozen laion5b s13b b90k Top1
  • VIT bigG 14 laion2b s39b b160k
  • coca_ViT-L-14/mscoco_finetuned_laion2b_s13b_b90k; convnext_large_d_320/laion2b_s29b_b131k_ft_soup.

Result

The captions end up wrong in most cases...

Download all texture and name it | [Manually]

Download Images and create for each one "<ImageName>.txt" in same directory. (log total tags using my script)

Dreambooth config settings

Batch Size: 1, Set Gradients to None When Zeroing: ✓, Gradient Checkpointing: ✓, Lora UNET Learning Rate: 0.0001, Lora Text Encoder Learning Rate: 0.00002, Learning Rate Scheduler: constant, Learning Rate Scheduler: 1, Constant/Linear Starting Factor: 1, Scale Position: 1, Max Resolution: 512; Optimizer: 8bit Adamw, Mixed Precision: fp16, Memory Attention: xformers, Cache Latents: ✓, Train UNET: ✓ Step Ratio of Text Encoder Training: 0, Freeze CLIP Normalization Layers: no, Instance Prompt: diffuse texture, [filewords] Deterministic: ✓

FixText

A script that generates letter images and checks them:

DatasetLetterMaker.py:

# rough letter OCR synthetic dataset augmented

import os, string, random, re, math, wordninja, unicodedata, easyocr, webcolors, nltk, hashlib
import matplotlib.font_manager as fm
from PIL import Image, ImageDraw, ImageFont
from nltk.corpus import brown
from nltk.probability import FreqDist
from torchvision.transforms import ColorJitter

# Inclusive bounds of the integers that main() turns into text samples.
START_NUMBER, END_NUMBER = 1, 26

# Output folder for the generated images and caption files (created on import).
os.makedirs('dataset', exist_ok=True)

# Photometric augmentation applied to every rendered image before it is saved.
COLOR_JITTER = ColorJitter(
    brightness=0.2,
    contrast=0.2,
    saturation=0.2,
    hue=0.1,
)

def sanitize_filename(text):
    """Return the SHA-256 hex digest of *text*, used as a filesystem-safe file stem."""
    digest = hashlib.sha256()
    digest.update(text.encode())
    return digest.hexdigest()

def calculate_font_size(draw, text, image_size, font_name=None):
    """Return the largest font size at which *text* is still smaller than *image_size*.

    Args:
        draw: Unused; kept so existing callers keep working.
        text: The string to measure.
        image_size: ``(width, height)`` the unrotated text has to fit into.
        font_name: Path of the font file to measure with. When omitted, a
            random system font is used (the original behaviour).

    Returns:
        The largest size whose text box is strictly smaller than *image_size*
        in both dimensions, or 0 if even size 1 does not fit.
    """
    if font_name is None:
        font_name = random.choice(fm.findSystemFonts())

    # Text that measures as zero-sized would never stop growing, so the search
    # is capped at the longer image side.
    size_cap = max(image_size)
    font_size = 0
    while font_size < size_cap:
        candidate = font_size + 1
        left, top, right, bottom = ImageFont.truetype(font_name, candidate).getbbox(text)
        if right - left >= image_size[0] or bottom - top >= image_size[1]:
            break
        font_size = candidate
    return font_size


def place_text_randomly(img, draw, text, bg_color, image_size):
    """Draw *text* onto *img* with a random font, size, rotation and position.

    Args:
        img: RGBA image that receives the text (modified in place).
        draw: ``ImageDraw`` object for *img*; only forwarded to
            ``calculate_font_size``.
        text: The string to render.
        bg_color: ``(r, g, b)`` background colour; the text uses its inverse.
        image_size: ``(width, height)`` of *img*.

    Returns:
        ``(success, font_size, font_type)``. Always three values, because the
        caller unpacks three. On failure ``font_size`` is the minimum font
        size, and ``font_type`` is ``None`` when no usable font was found.
    """
    min_font_size = 20

    # Try the system fonts in random order until one can render every
    # character. Each font is tried at most once, so the search always ends.
    font_name = None
    candidates = list(fm.findSystemFonts())
    random.shuffle(candidates)
    for candidate in candidates:
        try:
            probe = ImageFont.truetype(candidate, 10)
        except OSError:
            continue  # font file that Pillow cannot load
        if all(probe.getbbox(c) != (0, 0, 0, 0) for c in text):
            font_name = candidate
            break
    if font_name is None:
        return False, min_font_size, None

    font_type = os.path.basename(font_name).rsplit('.', 1)[0]
    text_color = tuple(255 - c for c in bg_color)
    angle = random.uniform(-30, 30)  # degrees, as expected by Image.rotate

    # Measure with the font that is actually used for drawing.
    max_font_size = calculate_font_size(draw, text, image_size, font_name)
    if max_font_size < min_font_size:
        return False, min_font_size, font_type

    font_size = random.randint(min_font_size, max_font_size)
    font = ImageFont.truetype(font_name, font_size)

    left, top, right, bottom = font.getbbox(text)
    text_width, text_height = right - left, bottom - top
    if text_width <= 0 or text_height <= 0:
        return False, min_font_size, font_type

    # Render the text on a transparent layer. Shifting by the box offset keeps
    # the glyphs inside the layer.
    text_img = Image.new('RGBA', (text_width, text_height), color=(0, 0, 0, 0))
    text_draw = ImageDraw.Draw(text_img)
    text_draw.text((-left, -top), text, fill=text_color + (255,), font=font)

    # Rotate once, letting the layer grow so no corner is cut off. The size of
    # the rotated layer is the exact footprint to keep inside the image.
    rotated_text_img = text_img.rotate(angle, expand=True)
    max_x_position = image_size[0] - rotated_text_img.width
    max_y_position = image_size[1] - rotated_text_img.height
    if max_x_position < 0 or max_y_position < 0:
        return False, min_font_size, font_type

    position = (random.randint(0, max_x_position),
                random.randint(0, max_y_position))
    img.alpha_composite(rotated_text_img, position)

    return True, font_size, font_type

def is_text_readable(image_path, text):
    """Return True if OCR reads exactly *text* from the image at *image_path*.

    Every detected region must have a confidence of at least 0.5, and the
    recognised strings joined by single spaces must equal *text*.

    Args:
        image_path: Path of the image to check.
        text: The string that was rendered into the image.
    """
    # Building a Reader for every image is wasteful, so one instance is kept
    # on the function and reused.
    reader = getattr(is_text_readable, '_reader', None)
    if reader is None:
        reader = is_text_readable._reader = easyocr.Reader(['en'])

    result = reader.readtext(image_path)

    # Nothing detected, or at least one detection below the confidence threshold.
    if not result or any(res[2] < 0.5 for res in result):
        return False

    recognized_text = ' '.join(res[1] for res in result)
    return recognized_text == text


def has_two_colors(image_path):
    """Return True if the image at *image_path* contains at least two colours.

    ``Image.getcolors(maxcolors)`` returns ``None`` when the image has more
    than ``maxcolors`` colours. With the default limit of 256 that ``None``
    reached ``len()`` and raised ``TypeError``. Asking for at most one colour
    makes ``None`` mean "two or more colours".
    """
    with Image.open(image_path) as img:
        return img.getcolors(maxcolors=1) is None

def closest_color(requested_color):
    """Return the CSS3 colour name nearest to *requested_color*, split into words.

    Nearness is the squared distance over the three RGB channels. When several
    names share the same distance, the one listed last wins.
    """
    name_by_distance = {}
    for css_name, hex_value in webcolors.CSS3_NAMES_TO_HEX.items():
        channels = webcolors.hex_to_rgb(hex_value)
        squared_distance = sum(
            (channels[index] - requested_color[index]) ** 2 for index in range(3)
        )
        name_by_distance[squared_distance] = css_name

    nearest_name = name_by_distance[min(name_by_distance)]
    return ' '.join(wordninja.split(nearest_name))

def rgb_to_name(rgb):
    """Return a space-separated colour name for the ``(r, g, b)`` tuple *rgb*.

    An exact CSS3 name is used when one exists; otherwise the result of
    ``closest_color`` is used.
    """
    try:
        matched_name = webcolors.rgb_to_name(rgb, spec='css3')
    except ValueError:
        # No CSS3 colour has exactly this value.
        matched_name = closest_color(rgb)
    return ' '.join(wordninja.split(matched_name))

def describe_font_size(font_size):
    """Translate a numeric font size into the coarse label used in captions."""
    # Exclusive upper bounds in ascending order; the first bound above the
    # size decides the label.
    buckets = (
        (30, "very small"),
        (40, "small"),
        (50, "medium"),
        (60, "large"),
    )
    for upper_bound, label in buckets:
        if font_size < upper_bound:
            return label
    return "very large"

def generate_image_and_text(text):
    """Render *text* into ``dataset/<hash>.png`` and describe it in ``dataset/<hash>.txt``.

    Up to three attempts are made, each with a new random background colour.
    An attempt is accepted when the saved image has at least two colours and
    OCR reads *text* back. If no attempt is accepted, the image and any caption
    for this text are removed, so the dataset never holds one without the other.

    Args:
        text: The word or character to render.
    """
    sanitized_text = sanitize_filename(text)
    img_path = f'dataset/{sanitized_text}.png'
    txt_path = f'dataset/{sanitized_text}.txt'

    for _ in range(3):  # at most three attempts
        # RGBA canvas with an opaque random background, so the rotated text
        # layer can be alpha-composited onto it.
        bg_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
        img = Image.new('RGBA', (1024, 1024), color=bg_color + (255,))
        d = ImageDraw.Draw(img)

        success, font_size, font_type = place_text_randomly(img, d, text, bg_color, img.size)

        # A single letter or digit is never skipped here, even when placement
        # reported a failure; the checks below still decide whether it is kept.
        if len(text) == 1 and text.isalnum():
            success = True

        if not success:
            continue

        COLOR_JITTER(img).convert('RGB').save(img_path, 'PNG')

        if has_two_colors(img_path) and is_text_readable(img_path, text):
            text_color_name = rgb_to_name(tuple([255 - c for c in bg_color]))
            bg_color_name = rgb_to_name(bg_color)
            size_description = describe_font_size(font_size)
            # The texts include non-ASCII characters, so the caption is always
            # written as UTF-8 rather than in the platform's locale encoding.
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(f'The word "{text}" is written in {text_color_name} letters on a {bg_color_name} background. The {font_type} font size is {size_description}.')
            break
    else:
        # No attempt was accepted: drop the image and any caption for this
        # text, including leftovers from an earlier run.
        for stale_path in (img_path, txt_path):
            if os.path.exists(stale_path):
                os.remove(stale_path)


def main():
    """Generate an image/caption pair for every text in the synthetic dataset.

    The texts are the 200 most frequent lower-cased words of the Brown corpus,
    the printable characters with code points 32-126, 161-171 and 174-255, the
    integers START_NUMBER..END_NUMBER, and the ASCII letters.
    """
    # The Brown corpus is needed for the word-frequency list.
    nltk.download('brown')

    fdist = FreqDist(word.lower() for word in brown.words())
    words = [word for word, _ in fdist.most_common(200)]

    # Printable single characters; control characters (category C*) are dropped.
    utf8_chars = [chr(i) for i in range(32, 127)] + [chr(i) for i in range(161, 172)] + [chr(i) for i in range(174, 256)]
    utf8_chars = [c for c in utf8_chars if unicodedata.category(c)[0] != 'C']

    numbers = [str(i) for i in range(START_NUMBER, END_NUMBER+1)]
    letters = list(string.ascii_lowercase) + list(string.ascii_uppercase)

    # The combined list used to be built and then ignored, so the numbers were
    # never generated. Duplicates (for example "a" as a word and as a letter)
    # map to the same file name, so they are dropped; first-seen order is kept.
    texts = list(dict.fromkeys(words + utf8_chars + numbers + letters))

    for text in texts:
        generate_image_and_text(text)


if __name__ == "__main__":
    main()

requirements.txt:

1
2
3
4
5
wordninja
easyocr
webcolors
nltk
matplotlib

Misc

Miscellaneous

Tips

Adding Full A model to B

(Optional) Combine Model: A pix2pix/inpaint/ownModel, B Protogen, C v1-5-Pruned, M 1


Edit
Pub: 27 Nov 2022 09:27 UTC
Edit: 21 Jan 2024 21:37 UTC
Views: 1221