Safer usage of mkdir across the project

2026-04-20 22:13:43 +00:00 · 2025-06-17 07:09:33 -07:00 · 2025-06-17 07:09:33 -07:00 · 0d1597616f
commit 0d1597616f
parent 8689d7ecea
16 changed files with 240 additions and 206 deletions
--- a/extensions/Training_PRO/script.py
+++ b/extensions/Training_PRO/script.py
@@ -109,12 +109,12 @@ def ui():
                            copy_from = gr.Dropdown(label='Copy parameters from', value='None', choices=get_available_loras_local(non_serialized_params['Lora_sortedByTime']), elem_classes=['slim-dropdown'])
                            create_refresh_button(copy_from, lambda: None, lambda: {'choices': get_available_loras_local(non_serialized_params['Lora_sortedByTime'])}, 'refresh-button')
                    with gr.Column():
-                        sort_byTime = gr.Checkbox(label='Sort list by Date', value=False, info='Sorts Loras by date created.', elem_classes=['no-background'])                        
+                        sort_byTime = gr.Checkbox(label='Sort list by Date', value=False, info='Sorts Loras by date created.', elem_classes=['no-background'])

                with gr.Row():
                    with gr.Column(scale=5):
                        lora_name = gr.Textbox(label='Name', info='The name of your new LoRA file')
-    
+
                    with gr.Column():
                        always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name is the same, checking will replace the existing file, and unchecking will load and continue from it (the rank must be the same).', elem_classes=['no-background'])

@@ -132,14 +132,14 @@ def ui():
                        epochs = gr.Number(label='Epochs', value=3, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
                        learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='In scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
                        lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='linear', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt', 'FP_low_epoch_annealing', 'FP_half_time_annealing','FP_raise_fall_creative'], info='Learning rate scheduler - defines how the learning rate changes over time. Custom schedulers: FP_low_epoch_annealing, FP_half_time_annealing, FP_raise_fall_creative (see README)', elem_classes=['slim-dropdown'])
-                        
+
                with gr.Accordion(label='Checkpoints', open=True):
                    with gr.Row():
                        with gr.Column():
                            save_steps = gr.Number(label='Save every n steps', value=0, info='A checkpoint will be saved every n steps and at each Epoch boundary. (0 = OFF)')
-                        with gr.Column():    
-                            save_steps_under_loss = gr.Slider(label='Save at 10% Loss change', value=1.8, minimum=0.0, maximum=3.0, step=0.1, info="Saves checkpoints at (or bellow) this loss and then each time loss falls by at least 10% This works independently from 'Save every n steps'")    
-                    with gr.Row():        
+                        with gr.Column():
+                            save_steps_under_loss = gr.Slider(label='Save at 10% Loss change', value=1.8, minimum=0.0, maximum=3.0, step=0.1, info="Saves checkpoints at (or bellow) this loss and then each time loss falls by at least 10% This works independently from 'Save every n steps'")
+                    with gr.Row():
                        save_chackpoint_now = gr.Button('Queue Checkpoint Now')

                with gr.Accordion(label='Advanced Options', open=True):
@@ -148,7 +148,7 @@ def ui():
                            warmup_steps = gr.Number(label='Warmup Steps', value=100, info='Number of max steps used for a linear warmup. Reduces early over-fitting by the first training blocks. Value has precedent over Warmup Ratio. Aligns to the closest multiple of graddient accumulation')
                            warmup_ratio = gr.Slider(label='Warmup Ratio', minimum=0.0, maximum=0.2, step=0.025, value=0.0, info='Ratio of total training steps that will be used for a linear warmup. It applies only if Warmup Step is 0.')
                            neft_noise_alpha = gr.Slider(label='NEFtune noise scale', minimum=0.0, maximum=15, step=1, value=0.0, info='Add noise to the training to improve generalization. [0 - OFF, Starting value to experiment: 5]')
-                            training_projection = gr.Radio(value = train_choices[4], label='LLaMA Target Projections', info='Change the targets (LORA is typically q-v)', choices=train_choices)    
+                            training_projection = gr.Radio(value = train_choices[4], label='LLaMA Target Projections', info='Change the targets (LORA is typically q-v)', choices=train_choices)
                            lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. Most users should leave at default.')
                            optimizer = gr.Dropdown(label='Optimizer', value='adamw_torch', choices=['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad'], info='Different optimizer implementation options, for advanced users. Effects of different options are not well documented yet.', elem_classes=['slim-dropdown'])

@@ -157,10 +157,10 @@ def ui():
                            add_bos_token = gr.Checkbox(label='Add BOS token', value=True, info="Adds BOS token for each dataset item")
                            add_eos_token = gr.Checkbox(label='Add EOS token', value=False, info="Adds EOS token for each dataset item")
                            add_eos_token_type = gr.Dropdown(label='EOS placement (Text file)', choices=['Every Block', 'Hard Cut Blocks Only'], value='Every Block', info='', allow_custom_value = False)
-                            
+
                            higher_rank_limit = gr.Checkbox(label='Enable higher ranks', value=False, info='If checked, changes Rank/Alpha slider above to go much higher. This will not work without a datacenter-class GPU.')
                            report_to = gr.Radio(label="Save detailed logs with", value="None", choices=["None", "wandb", "tensorboard"], interactive=True)
-                # for future            
+                # for future
                #with gr.Accordion(label='Dynamic Scheduler', open = False):
                #    ds_min_epochs = gr.Number(label='Minimum Epochs', value='1', info='Minimum epochs that will be always performed before ramp down can be triggered')
                #    ds_max_epochs = gr.Number(label='Maximum Epochs (fallback)', value='50', info='Maximum Epochs before the training will bail out completely (should be a large number)')
@@ -168,7 +168,7 @@ def ui():
                #    ds_loss_rolling_window = gr.Number(label='Loss rolling average', value='4', info='Calculate loss by averaging last x numbers to avoid jumps and noise')
                #    ds_epochs_to_ramp = gr.Slider(label='Ramp down ratio', minimum=0.0, maximum=2.0, step=0.1, value=1.00, info='How long the ramp down will last relative to ellapsed steps (before trigger)')
                #    gr.Markdown('These are settings for FP_dynamic_loss_trigger scheduler. The scheduler will do warm up, then hold constant untill a loss falls under Trigger Loss, then it will commence linear ramp down schedule and stop. The length of ramp down is set by Ramp down ratio where (ramp down steps) = ratio * (elapsed steps). (The time to completition shown will be very high untill ramp down is triggered.)')
-                        
+

            with gr.Column():
                with gr.Tab(label='Formatted Dataset'):
@@ -217,7 +217,7 @@ def ui():
                            cutoff_len = gr.Slider(label='Chunk Length (Cutoff Length)', minimum=32, maximum=2048, value=256, step=32, info='The maximum length of a chunk (in tokens). Applies to both JSON dataset and text files. Higher values require much more VRAM.')
                with gr.Row():
                    with gr.Column():
-                        check_dataset_btn = gr.Button('Verify Dataset/Text File and suggest data entries')    
+                        check_dataset_btn = gr.Button('Verify Dataset/Text File and suggest data entries')
                        check_dataset_txt = gr.Textbox(label='Dataset info', value='')

                with gr.Row():
@@ -227,8 +227,8 @@ def ui():
                with gr.Accordion(label="Graph", open=True):
                    with gr.Row():
                        # show_actions_button = False - we use old gradio
-                        plot_graph = gr.LinePlot(x="epoch", y="value", title="Loss Metrics", overlay_point=True, tooltip=["epoch", "value"], x_lim=[0, 1], y_lim=[0, 3.5], width=500, height=250) 
- 
+                        plot_graph = gr.LinePlot(x="epoch", y="value", title="Loss Metrics", overlay_point=True, tooltip=["epoch", "value"], x_lim=[0, 1], y_lim=[0, 3.5], width=500, height=250)
+
                output = gr.Markdown(value="Ready")

    with gr.Tab('Perplexity evaluation', elem_id='evaluate-tab'):
@@ -267,7 +267,7 @@ def ui():

        return grad_accumulation_val

-    
+
    copy_from.change(partial(do_copy_params, all_params= all_params), copy_from, all_params).then(fix_old_version,[batch_size,micro_batch_size, grad_accumulation],grad_accumulation)
    start_button.click(do_train, all_params, [output,plot_graph])
    stop_button.click(do_interrupt, None, None, queue=False)
@@ -306,8 +306,8 @@ def ui():
        if shared.tokenizer is None:
            yield "Tokenizer is not available. Please Load some Model first."
            return
-        
-        
+
+
        if raw_text_file not in ['None', '']:
            logger.info("Loading Text file...")
            fullpath = clean_path('user_data/training/datasets', f'{raw_text_file}')
@@ -329,8 +329,8 @@ def ui():
                except:
                    yield f"{raw_text_file}.txt doesn't seem to exsist anymore... check your user_data/training/datasets folder"
                    return
-            
- 
+
+
            if min_chars<0:
                min_chars = 0

@@ -343,11 +343,11 @@ def ui():
            total_blocks = len(text_chunks)
            result = f"Text: ({raw_text_file}.txt) has {total_blocks} blocks (Block Size {cutoff_len} tokens)"
            del text_chunks
-       
+
        else:
            if dataset in ['None', '']:
                yield "Select dataset or text file."
-                return 
+                return

            if format in ['None', '']:
                yield "Select format choice for dataset."
@@ -382,8 +382,8 @@ def ui():

            logger.info("Loading JSON datasets...")
            data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{dataset}.json'))
-            
-            data_keys = [] 
+
+            data_keys = []

            if data:
                if 'train' in data:  # Check if the 'train' split exists in the dataset
@@ -400,11 +400,11 @@ def ui():
            #for options, data in format_data.items():
            #    format_keys = options.split(',')
            #    result += f"{format_keys}, "
-            #result = result.rstrip()    
-            #result = result.rstrip(',')  
+            #result = result.rstrip()
+            #result = result.rstrip(',')

        if total_blocks>0:
-            number_ofSteps = int(math.ceil(total_blocks / micro_batch_size) * epochs) 
+            number_ofSteps = int(math.ceil(total_blocks / micro_batch_size) * epochs)
            num_stepsPer_epoch = int(math.ceil(number_ofSteps/epochs))
            min_warm = math.ceil(100 / grad_accumulation)

@@ -415,20 +415,20 @@ def ui():
            save_each_n_max = int(math.ceil(number_ofSteps/5))
            gradient_accumulation_max = int(total_blocks)//micro_batch_size

- 
+
            result += f"\n[Batch Size: {micro_batch_size}, Epochs: {epochs}, Gradient Accumulation: {grad_accumulation}]\n"
            result += f"Total number of steps: {number_ofSteps}\n"
            result += f"Steps per each Epoch: {num_stepsPer_epoch}\n"
            result += f"Suggestions:\n"
            result += f"Checkpoints: Save every {save_each_n_min} - {save_each_n_max} steps (Current: {int(save_steps)})\n"
            result += f"Warmup steps: {warmup_steps_suggest} (Current: {int(warmup_steps)})"
-            if gradient_accumulation_max < grad_accumulation: 
+            if gradient_accumulation_max < grad_accumulation:
                result += f"\n\nWARNING: Gradient Accumulation {grad_accumulation} is too high: It should be below {gradient_accumulation_max}"


        yield result
        return
-    
+
    check_dataset_btn.click(check_dataset, dataset_calc_params ,check_dataset_txt)

    # Evaluation events. For some reason, the interrupt event
@@ -449,10 +449,10 @@ def ui():

    def reload_lora():
        return gr.Dropdown.update(choices=get_available_loras_local(non_serialized_params['Lora_sortedByTime']))
- 
+
    # nonserialized items

-    sort_byTime.change(lambda x: non_serialized_params.update({"Lora_sortedByTime": x}), sort_byTime, None).then(reload_lora,None,copy_from) 
+    sort_byTime.change(lambda x: non_serialized_params.update({"Lora_sortedByTime": x}), sort_byTime, None).then(reload_lora,None,copy_from)
    #debug_slicer.change(lambda x: non_serialized_params.update({"debug_slicer": x}), debug_slicer, None)

    def update_dataset():
@@ -482,7 +482,7 @@ def do_copy_params(lora_name: str, all_params):
        else:
            params = {}
    else:
-        params = {}        
+        params = {}

    result = list()
    for i in range(0, len(PARAMETERS)):
@@ -521,7 +521,8 @@ def backup_adapter(input_folder):

            # Create the new subfolder
            subfolder_path = Path(f"{input_folder}/{creation_date_str}")
-            subfolder_path.mkdir(parents=True, exist_ok=True)
+            if not subfolder_path.exists():
+                subfolder_path.mkdir(parents=True, exist_ok=True)

            # Check if the file already exists in the subfolder
            backup_adapter_file = Path(f"{input_folder}/{creation_date_str}/adapter_model.bin")
@@ -607,7 +608,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
    shared.tokenizer.padding_side = "left"

    def encode(text, prepend_bos_token):
-       
+
        result = shared.tokenizer.encode(text, truncation=True, max_length=cutoff_len)
        # Check if the first two tokens are BOS
        if len(result) >= 2 and result[:2] == [shared.tokenizer.bos_token_id, shared.tokenizer.bos_token_id]:
@@ -626,7 +627,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
                input_ids.append(shared.tokenizer.eos_token_id)

            input_ids = [shared.tokenizer.pad_token_id] * (cutoff_len - len(input_ids)) + input_ids
-            
+
            labels = [1] * len(input_ids)
        else:
            ind = prompt.index(train_only_after) + len(train_only_after)
@@ -653,7 +654,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
        }

    train_template.clear()
-            
+
    #reset stuff
    print(f"*** LoRA: {lora_name} ***")
    non_serialized_params.update({"stop_at_loss": stop_at_loss})
@@ -665,7 +666,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
    non_serialized_params.update({"checkpoint_offset": 0})
    non_serialized_params.update({"epoch_offset": 0})
    train_log_graph.clear()
-  
+
     # == Prep the dataset, format, etc ==
    if raw_text_file not in ['None', '']:
        train_template["template_type"] = "raw_text"
@@ -685,8 +686,8 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
        else:
            with open(clean_path('user_data/training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
                raw_text = file.read().replace('\r', '')
-        
-        # FPHAM PRECISE SLICING        
+
+        # FPHAM PRECISE SLICING
        if min_chars<0:
            min_chars = 0

@@ -703,7 +704,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch

        train_data = Dataset.from_list([tokenize(x, add_EOS_to_all, add_bos_token) for x in text_chunks])
        if add_EOS_to_all:
-            print(f"Added EOS to {len(text_chunks)} blocks") 
+            print(f"Added EOS to {len(text_chunks)} blocks")

        print(f"All Data Blocks: {len(text_chunks)}")

@@ -745,7 +746,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
        data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{dataset}.json'))
        train_data = data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30))

-        print(f"BOS: {add_bos_token} EOS: {add_eos_token}") 
+        print(f"BOS: {add_bos_token} EOS: {add_eos_token}")
        print(f"Data Blocks: {train_data.num_rows}")

        if eval_dataset == 'None':
@@ -783,7 +784,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
            print(f"Method: {RED}QLORA{RESET}")
            prepare_model_for_kbit_training(shared.model)
        else:
-            print(f"Method: {RED}LoRA{RESET}")    
+            print(f"Method: {RED}LoRA{RESET}")

    # base model is now frozen and should not be reused for any other LoRA training than this one
    shared.model_dirty_from_training = True
@@ -796,7 +797,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
    elif training_projection==train_choices[2]:
        model_to_lora_modules[model_id] = ["q_proj","k_proj", "v_proj"]
    elif training_projection==train_choices[3]:
-        model_to_lora_modules[model_id] = ["k_proj", "v_proj", "down_proj"]        
+        model_to_lora_modules[model_id] = ["k_proj", "v_proj", "down_proj"]
    else:
        model_to_lora_modules[model_id] = ["q_proj", "v_proj"]

@@ -827,9 +828,9 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
            set_peft_model_state_dict(lora_model, state_dict_peft)

            print(f" + Continue Training on {RED}{lora_file_path}/adapter_model.bin{RESET}")
-            
+
            #load training_log.json if exist
-           
+
            if Path(f"{lora_file_path}/training_log.json").is_file():
                with open(f"{lora_file_path}/training_log.json", 'r') as json_file:
                    json_ilog = json.load(json_file)
@@ -840,13 +841,13 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
                        if key=='epoch':
                            non_serialized_params.update({"epoch_offset": value})
                            print(f" + Epoch offset: {RED}{non_serialized_params['epoch_offset']}{RESET}")
-           
+

            if Path(f"{lora_file_path}/training_graph.json").is_file():
                try:
                    with open(f"{lora_file_path}/training_graph.json", 'r') as json_file:
                        train_log_graph = json.load(json_file)
-                        print(" + Training Graph loaded")   
+                        print(" + Training Graph loaded")
                except:
                    print(f"Can't read training_graph")

@@ -876,72 +877,72 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
            else:
                current_loss = float(train_log.get('loss', 0.0))
                current_epoch_int = int(float(train_log.get('epoch', 0.0)))
-              
+
                force_save = False

                current_steps_offset = tracked.current_steps + non_serialized_params['checkpoint_offset']

-                folder_save = f"checkpoint-{current_steps_offset}"    
+                folder_save = f"checkpoint-{current_steps_offset}"

                # save if triggered by user
                if non_serialized_params['save_checkpoint_now']:
                    force_save = True
                    non_serialized_params.update({"save_checkpoint_now": False})
                    print(f"\033[1;31;1mSave Checkpoint manually trigerred.\033[0;37;0m")
-                    folder_save = f"checkpoint-{current_steps_offset}-user"  
+                    folder_save = f"checkpoint-{current_steps_offset}-user"

                patience = 3     # Set the number of consecutive steps for tracking stability
-                
+
                if gradient_accumulation_steps==1:
                    patience = 4

                min_steps = ssteps10

-                # Save each time the loss is below the threshold 
+                # Save each time the loss is below the threshold
                if current_loss < non_serialized_params['save_steps_under_loss'] and current_loss > 0 and state.global_step > min_steps:
                    current_stability = non_serialized_params['current_stability']
                    current_stability += 1
-                    non_serialized_params.update({"current_stability": current_stability}) 
+                    non_serialized_params.update({"current_stability": current_stability})

                    if current_stability >= patience:
                        current_stability = 0
-                        non_serialized_params.update({"current_stability": current_stability})     
+                        non_serialized_params.update({"current_stability": current_stability})
                        current_loss_dec = round(current_loss, 2)
                        loss_str = f"{current_loss_dec:.2f}"
                        loss_str = loss_str.replace('.', '_')
                        new_save = (current_loss_dec-0.1) + 0.01
                        non_serialized_params.update({"save_steps_under_loss": new_save})

-                        folder_save = f"checkpoint-{current_steps_offset}-loss-{loss_str}" 
-                        force_save = True   
+                        folder_save = f"checkpoint-{current_steps_offset}-loss-{loss_str}"
+                        force_save = True
+

-                   
                else:
                    # Reset stability if the loss goes above the threshold
-                    non_serialized_params.update({"current_stability": 0})   
+                    non_serialized_params.update({"current_stability": 0})

                # Save full epochs
-                if actual_save_steps>0 and current_epoch_int > non_serialized_params['save_epochs'] and state.global_step > min_steps: 
+                if actual_save_steps>0 and current_epoch_int > non_serialized_params['save_epochs'] and state.global_step > min_steps:
+

-                    
                    current_epoch_offset = current_epoch_int
-                    
+
                    if non_serialized_params['epoch_offset'] > 0:
                        current_epoch_offset = current_epoch_int + round(non_serialized_params['epoch_offset'], 2)
-                    
+
                    ep_off_str = f"{current_epoch_offset}"
                    ep_off_str = ep_off_str.replace('.', '_')
-                    folder_save = f"checkpoint-{current_steps_offset}-epoch-{ep_off_str}" 
+                    folder_save = f"checkpoint-{current_steps_offset}-epoch-{ep_off_str}"

                    non_serialized_params.update({"save_epochs": current_epoch_int})
                    force_save = True

                # save each actual_save_steps
                if state.global_step > 0 and actual_save_steps > 0 and state.global_step % actual_save_steps == 0:
-                    folder_save = f"checkpoint-{current_steps_offset}"  
-                    force_save = True   
+                    folder_save = f"checkpoint-{current_steps_offset}"
+                    force_save = True

-                if force_save:       
+                if force_save:
                    lora_model.save_pretrained(f"{lora_file_path}/{folder_save}/", safe_serialization = non_serialized_params['safe_serialization'])
                    print(f"\033[1;30;40mStep: {tracked.current_steps:6} \033[0;37;0m Saved: [{folder_save}]")
                    # Save log
@@ -950,7 +951,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
                    # == Save training prompt ==
                    with open(f"{lora_file_path}/{folder_save}/training_prompt.json", 'w', encoding='utf-8') as file:
                        json.dump(train_template, file, indent=2)
-                
+

        def on_substep_end(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs):
            tracked.current_steps += 1
@@ -975,7 +976,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
                print(f"\033[1;30;40mStep: {tracked.current_steps:6} [+{non_serialized_params['checkpoint_offset']}] \033[0;37;0m", end='')
            else:
                print(f"\033[1;30;40mStep: {tracked.current_steps:6} \033[0;37;0m", end='')
-            
+
            graphentry = {
                'current_steps': int(train_log.get('current_steps_adjusted',0)),
                'loss': float(train_log.get('loss', 0.0)),
@@ -986,7 +987,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
            cur_loss = float(train_log.get('loss', 0.0))
            cur_lr = float(train_log.get('learning_rate', 0.0))
            cur_epoch = float(train_log.get('epoch', 0.0))
-            
+
            if len(statistics['loss']) == 1:
                first_epoch = statistics['loss'][0]['epoch']
                first_value = statistics['loss'][0]['value']
@@ -1013,7 +1014,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch

    # FPHAM SAMPLE REQ Transformers error handling
    gradient_accumulation_max = int(train_data.num_rows)//micro_batch_size
-    
+
    if gradient_accumulation_max < gradient_accumulation_steps:
        print(f"{RED}WARNING:{RESET} Current gradient accumulation is {RED}too high{RESET} for the amount of training data.")
        print(f"Gradient accumulation: {gradient_accumulation_steps} should be less than: {gradient_accumulation_max}. {RED}This could crash Accelerate/Transformers{RESET}")
@@ -1041,9 +1042,9 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
    elif lr_scheduler_type =='FP_raise_fall_creative':
        custom_scheduller = True
        lr_scheduler_type_arg = 'constant_with_warmup'
-    
+
    #gradient_checkpointing=True
-    
+
    args=transformers.TrainingArguments(
            report_to=report_to if report_to != "None" else None,
            per_device_train_batch_size=micro_batch_size,
@@ -1095,7 +1096,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
            data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
            callbacks=list([Callbacks()])
        )
-    
+
    # END OF FPHAM CUSTOM SCHEDULER

    lora_model.config.use_cache = False
@@ -1141,7 +1142,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
    if stop_at_loss > 0:
        print(f"Monitoring loss {RED}(Auto-Stop at: {stop_at_loss}){RESET}")

-    
+

    if WANT_INTERRUPT:
        yield "Interrupted before start.", zero_pd
@@ -1157,7 +1158,9 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
                decoded_entries.append({"value": decoded_text})

            # Write the log file
-            Path('user_data/logs').mkdir(exist_ok=True)
+            if not Path('user_data/logs').exists():
+                Path('user_data/logs').mkdir(exist_ok=True)
+
            with open(Path('user_data/logs/train_dataset_sample.json'), 'w') as json_file:
                json.dump(decoded_entries, json_file, indent=4)

@@ -1191,7 +1194,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
        else:
            max_value = 3.5
            last_epoch = 0
-            first_epoch = 0           
+            first_epoch = 0

        if WANT_INTERRUPT:

@@ -1210,7 +1213,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
            time_elapsed = time.perf_counter() - start_time
            lastloss = float(train_log.get('loss', 0.0))

-            non_serialized_params.update({"training_loop": True})               
+            non_serialized_params.update({"training_loop": True})

            if lastloss > 0:
                lastloss_str = f", ... Current Loss: `{lastloss:.2f}`"
@@ -1232,7 +1235,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
            if stop_at_loss != non_serialized_params['stop_at_loss']:
                stop_at_loss = non_serialized_params['stop_at_loss']
                print(f"Stop at loss changed {RED}(Auto-Stop at: {stop_at_loss}){RESET}")
-            
+
            losses = gr.LinePlot.update(
 				value = pd.DataFrame(statistics['loss']),
                x="epoch", y="value",
@@ -1240,7 +1243,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
                overlay_point=True, tooltip=["epoch", "value"],
 				x_lim=[first_epoch,last_epoch], y_lim=[0,max_value],
                width=500, height=250 )
-				
+

            yield f"Running... **{tracked.current_steps}** / **{tracked.max_steps}** ... {timer_info}, {format_time(time_elapsed)} / {format_time(total_time_estimate)} ... {format_time(total_time_estimate - time_elapsed)} remaining {lastloss_str}", losses

@@ -1256,7 +1259,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
    else:
        max_value = 3.5
        last_epoch = 0
-        first_epoch = 0 
+        first_epoch = 0

    return_pd = gr.LinePlot.update(
        value = pd.DataFrame(statistics['loss']),