# Source: simplest-bitcoin-book/gemini_translation_script.py (main branch of Keysa21/simplest-bitcoin-book on GitHub)

import os
import shutil
import argparse
import re
from google import genai
from google.genai import types
from pathlib import Path

# Configure the API key: read once from the environment at import time.
# The value is None when the variable is unset; main() refuses to run without it.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
# Source language used when --source_language is not passed on the command line.
DEFAULT_SOURCE_LANG = "english"
# Recommended to use a newer model that is good with instruction following.
# Check https://ai.google.dev/gemini-api/docs/models/gemini for available models.
DEFAULT_MODEL_NAME = "gemini-2.0-flash"

# Rough estimate of characters per token (a heuristic; nothing in this script measures it)
CHARS_PER_TOKEN = 4
MAX_CHUNK_TOKENS = 6000  # Leave some buffer below the 8192 limit
# Character budget for a single translation request, derived from the two estimates above.
MAX_CHUNK_CHARS = MAX_CHUNK_TOKENS * CHARS_PER_TOKEN

def _fenced_code_spans(content: str) -> list[tuple[int, int]]:
    """Return (start, end) character offsets of fenced code blocks in content.

    A fence is a line starting (after at most three spaces) with three or more
    backticks or tildes. A block is closed by a fence of the same character that
    is at least as long as the opening one and has nothing after it. An
    unterminated block is treated as running to the end of the content.
    """
    fence_line = re.compile(r'^ {0,3}(`{3,}|~{3,})([^\n]*)$', re.MULTILINE)
    spans: list[tuple[int, int]] = []
    open_fence = None  # fence string of the block currently open, if any
    open_start = 0
    for match in fence_line.finditer(content):
        fence, trailing = match.group(1), match.group(2)
        if open_fence is None:
            # A backtick fence whose info string contains a backtick is really
            # inline code at the start of a line, not an opening fence.
            if fence[0] == '`' and '`' in trailing:
                continue
            open_fence, open_start = fence, match.start()
        elif fence[0] == open_fence[0] and len(fence) >= len(open_fence) and not trailing.strip():
            spans.append((open_start, match.end()))
            open_fence = None
    if open_fence is not None:
        spans.append((open_start, len(content)))
    return spans


def _matches_outside_spans(pattern: re.Pattern, content: str, spans: list[tuple[int, int]]) -> list[re.Match]:
    """Return the matches of pattern in content that do not start inside any span."""
    return [
        match
        for match in pattern.finditer(content)
        if not any(start <= match.start() < end for start, end in spans)
    ]


def split_markdown_into_chapters(content: str) -> list[str]:
    """
    Splits markdown content into chapters based on heading patterns.

    Headings of the form "# Chapter N" / "## Ch. N" (any heading level, case
    insensitive) are used as split points. If there are none, any level 1 or 2
    heading is used instead. If there are no such headings either, the content
    is returned whole when it fits in MAX_CHUNK_CHARS, otherwise it is split by
    size only.

    Lines that look like headings but sit inside a fenced code block (for
    example "# comment" in a shell or Python snippet) are NOT split points, so
    a code block is never torn apart at a comment line.

    Returns a list of stripped chunk strings in document order. Text before the
    first heading comes first; every other chunk starts with a heading unless
    it had to be split further by split_by_size because it exceeded
    MAX_CHUNK_CHARS.
    """
    code_spans = _fenced_code_spans(content)

    # Pattern to match chapter headings (# Chapter N, ## Chapter N, etc.)
    chapter_pattern = re.compile(r'^(#+\s+(?:Chapter|Ch\.?)\s+\d+.*?)$', re.MULTILINE | re.IGNORECASE)
    matches = _matches_outside_spans(chapter_pattern, content, code_spans)

    if not matches:
        # No chapters found, try to split on any major heading
        heading_pattern = re.compile(r'^(#{1,2}\s+.+?)$', re.MULTILINE)
        matches = _matches_outside_spans(heading_pattern, content, code_spans)

        if not matches:
            # No major headings, return whole content if small enough
            return [content] if len(content) <= MAX_CHUNK_CHARS else split_by_size(content)

    # Section boundaries: text before the first heading (if any), then one
    # section per heading running up to the next heading or the end.
    starts = [match.start() for match in matches]
    boundaries = [0] + starts if starts[0] > 0 else starts
    boundaries.append(len(content))

    chapters = []
    for start, end in zip(boundaries, boundaries[1:]):
        section = content[start:end].strip()
        if not section:
            # Only possible for a whitespace-only preamble.
            continue
        # If a section is still too large, split it further
        if len(section) > MAX_CHUNK_CHARS:
            chapters.extend(split_by_size(section))
        else:
            chapters.append(section)

    return chapters

def split_by_size(content: str) -> list[str]:
    """
    Splits content by approximate size, trying to break at paragraph boundaries.
    """
    if len(content) <= MAX_CHUNK_CHARS:
        return [content]

    chunks = []
    paragraphs = content.split('\n\n')
    current_chunk = ""

    for paragraph in paragraphs:
        # If adding this paragraph would exceed limit, start new chunk
        if current_chunk and len(current_chunk) + len(paragraph) + 2 > MAX_CHUNK_CHARS:
            chunks.append(current_chunk.strip())
            current_chunk = paragraph
        else:
            if current_chunk:
                current_chunk += '\n\n' + paragraph
            else:
                current_chunk = paragraph

        # If even a single paragraph is too large, split it by sentences
        if len(current_chunk) > MAX_CHUNK_CHARS:
            sentences = re.split(r'(?<=[.!?])\s+', current_chunk)
            chunk_part = ""
            for sentence in sentences:
                if chunk_part and len(chunk_part) + len(sentence) + 1 > MAX_CHUNK_CHARS:
                    chunks.append(chunk_part.strip())
                    chunk_part = sentence
                else:
                    if chunk_part:
                        chunk_part += ' ' + sentence
                    else:
                        chunk_part = sentence
            current_chunk = chunk_part

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

def translate_text(client: genai.Client, model_name: str, text_content: str, target_language: str, source_language: str, safety_settings: list) -> str | None:
    """
    Translates the given text content using the Gemini API via the client.

    The call is best-effort: any Exception raised by the request or while
    reading the response is caught, printed, and reported through the return
    value instead of propagating.

    Args:
        client: The configured genai.Client instance.
        model_name: The name of the model to use (e.g., "gemini-2.0-flash-001").
        text_content: The text to translate.
        target_language: The language to translate to.
        source_language: The language to translate from.
        safety_settings: Safety settings for the generation.

    Returns:
        The translated text when the response contains text. A warning is
        printed if the model stopped because it hit max_output_tokens, since
        the returned translation is then probably truncated.
        If the response has no usable text (e.g. it was blocked), the original
        text prefixed with a "[[TranslationBlocked: ...]]" marker.
        If an exception occurred, the original text prefixed with a
        "[[TranslationError: ...]]" marker.
        (The "| None" in the annotation is kept for callers that still check
        for it; every path in this function returns a string.)
    """
    system_instruction_prompt = f"""You are an expert Markdown translator. Your primary task is to translate Markdown text from {source_language} to {target_language}.

IMPORTANT RULES:
1.  Preserve the original Markdown formatting EXACTLY. This includes, but is not limited to:
    *   Headings (e.g., #, ##, ###)
    *   Lists (ordered and unordered, nested lists)
    *   Bold (e.g., **text** or __text__)
    *   Italics (e.g., *text* or _text_)
    *   Strikethrough (e.g., ~~text~~)
    *   Code blocks (e.g., ```python ... ``` or indented code blocks)
    *   Inline code (e.g., `code`)
    *   Links (e.g., [text](url))
    *   Images (e.g., ![alt text](image_url))
    *   Tables
    *   Blockquotes (e.g., > quote)
    *   Horizontal rules (e.g., --- or ***)
    *   HTML tags embedded in Markdown (e.g., <details>, <summary>, <div>) - translate content within these tags if it's text, but leave tags themselves.
2.  Only translate the textual content. Do NOT alter Markdown syntax elements, URLs, image paths, or code within code blocks (except for comments within code).
3.  If you encounter code blocks:
    *   Translate any comments (e.g., # comment in Python, // comment in JS) within the code block into {target_language}.
    *   Leave the code itself UNCHANGED.
4.  If you encounter HTML tags, translate the text content within these tags, but leave the HTML tags themselves and their attributes unchanged.
5.  Translate the text accurately and naturally into {target_language}.
6.  Ensure that relative paths in links and images (e.g., `../images/foo.png` or `./another-doc.md`) are preserved exactly as they are.
7.  Do not add any introductory or concluding phrases like "Here is the translated text:". Only output the translated Markdown.
8.  Pay special attention to frontmatter (e.g., YAML block at the start of the file, enclosed in ---). Translate the values in the frontmatter, but keep the keys and structure intact. For example:
    ---
    title: My English Title
    author: John Doe
    ---
    should become (for French translation):
    ---
    title: Mon Titre Français
    author: John Doe
    ---
When you are given the Markdown text, provide only the translated version of it.
"""

    # The whole exchange stays inside the try so that unexpected response
    # shapes also end up in the error-marker fallback rather than crashing.
    try:
        response = client.models.generate_content(
            model=model_name,
            contents=text_content, # Actual markdown to translate
            config=types.GenerateContentConfig(
                system_instruction=system_instruction_prompt,
                safety_settings=safety_settings,
                max_output_tokens=8192,  # Maximum for Gemini 2.0 Flash
                # Add other config like temperature here if needed
            )
        )

        translated_text = None
        if response.candidates and response.candidates[0].content:
            # May still be None/empty when the candidate carries no text parts.
            translated_text = response.text

        if translated_text:
            if response.candidates[0].finish_reason == types.FinishReason.MAX_TOKENS:
                print("Warning: Translation stopped at the output token limit; the translated chunk is probably truncated.")
            return translated_text

        # Handling cases where the response might be blocked or empty
        prompt_feedback = response.prompt_feedback
        print(f"Warning: Received no valid content in response for a chunk. Blocked: {prompt_feedback.block_reason if prompt_feedback else 'N/A'}")
        if prompt_feedback and prompt_feedback.block_reason:
            print(f"Block reason details: {prompt_feedback.block_reason_message}")
        return f"[[TranslationBlocked: {prompt_feedback.block_reason if prompt_feedback else 'Unknown reason'}]]\n\n{text_content}"

    except Exception as e:
        print(f"Error during translation: {e}")
        # Fallback: return original content with an error marker
        return f"[[TranslationError: {e}]]\n\n{text_content}"


def _translate_markdown(client, model_name: str, content: str, target_language: str, source_language: str, safety_settings: list) -> str | None:
    """Translate the Markdown text of one file, chunking it first when it is large.

    Content of at most MAX_CHUNK_CHARS characters is sent to translate_text in
    one request. Larger content is split with split_markdown_into_chapters,
    each chunk is translated separately (a chunk whose translation comes back
    empty is kept in the original language) and the results are joined with a
    blank line.

    Returns whatever translate_text returned for small content, or the joined
    chunks for large content.
    """
    if len(content) <= MAX_CHUNK_CHARS:
        return translate_text(
            client, model_name, content, target_language, source_language, safety_settings
        )

    print(f"  File is large ({len(content)} chars), splitting into chunks...")
    chunks = split_markdown_into_chapters(content)
    print(f"  Split into {len(chunks)} chunks")

    translated_chunks = []
    for i, chunk in enumerate(chunks, 1):
        print(f"  Translating chunk {i}/{len(chunks)}...")
        translated_chunk = translate_text(
            client, model_name, chunk, target_language, source_language, safety_settings
        )
        if not translated_chunk:
            print(f"  Warning: Failed to translate chunk {i}, using original")
            translated_chunk = chunk
        translated_chunks.append(translated_chunk)

    # Join chunks back together
    return "\n\n".join(translated_chunks)


def main():
    """Command-line entry point: translate a directory tree of Markdown files.

    Mirrors the source directory into a sibling directory named after the
    (lower-cased) target language. ".md" files are translated through the
    Gemini API; every other file is copied unchanged. Existing target files
    trigger an overwrite prompt unless --force_overwrite is given (answering
    "all" at the prompt enables it for the rest of the run).

    Problems are reported with print() and the function returns early or moves
    on to the next file; it does not raise for missing configuration or for
    per-file failures.
    """
    parser = argparse.ArgumentParser(
        description="Translates Markdown files in a directory structure using the Gemini API."
    )
    parser.add_argument(
        "source_dir",
        type=str,
        help="The source directory containing .md files to translate (e.g., 'english').",
    )
    parser.add_argument(
        "target_language",
        type=str,
        help="The target language for translation (e.g., 'french', 'spanish', 'luganda'). This will also be the name of the output directory.",
    )
    parser.add_argument(
        "--source_language",
        type=str,
        default=DEFAULT_SOURCE_LANG,
        help=f"The source language of the content (default: {DEFAULT_SOURCE_LANG}).",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default=DEFAULT_MODEL_NAME,
        help=f"The Gemini model to use for translation (default: {DEFAULT_MODEL_NAME})."
    )
    parser.add_argument(
        "--force_overwrite",
        action="store_true",
        help="Overwrite existing translated files without asking.",
    )

    # Parse before checking the API key so that --help and usage errors work
    # even when the key is not configured.
    args = parser.parse_args()

    if not GEMINI_API_KEY:
        print("Error: GEMINI_API_KEY environment variable not set.")
        print("Please set it before running the script: export GEMINI_API_KEY='your_api_key'")
        return

    source_path = Path(args.source_dir).resolve()
    target_language_name = args.target_language.lower()
    # Create target_dir in the parent of source_path, i.e. workspace root if source_dir is 'english/'
    target_path_root = source_path.parent / target_language_name

    if not source_path.is_dir():
        print(f"Error: Source directory '{source_path}' not found.")
        return

    # Translating a directory onto itself would overwrite the source files.
    # samefile() also catches names that differ only by case on
    # case-insensitive filesystems.
    if target_path_root.exists() and target_path_root.samefile(source_path):
        print(f"Error: Target directory '{target_path_root}' is the source directory itself. Choose a target language name that differs from the source directory name.")
        return

    try:
        target_path_root.mkdir(parents=True, exist_ok=True)
        print(f"Target directory created/exists: '{target_path_root}'")
    except OSError as e:
        print(f"Error creating target directory '{target_path_root}': {e}")
        return

    # Configure safety settings
    # Adjust safety settings if needed, e.g., for less restrictive blocking.
    safety_settings = [
        types.SafetySetting(category=category, threshold="BLOCK_MEDIUM_AND_ABOVE")
        for category in (
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )
    ]

    try:
        client = genai.Client(api_key=GEMINI_API_KEY)
        print(f"Using Gemini model: {args.model_name} via Client")
    except Exception as e:
        print(f"Error initializing Gemini Client or validating model '{args.model_name}': {e}")
        return

    print(f"Starting translation from '{source_path}' ({args.source_language}) to '{target_path_root}' ({args.target_language})...")

    for root, _dirs, files in os.walk(source_path):
        relative_path = Path(root).relative_to(source_path)
        target_dir_current = target_path_root / relative_path

        try:
            target_dir_current.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            print(f"Warning: Could not create directory {target_dir_current}: {e}. Skipping.")
            continue

        for file_name in files:
            source_file_path = Path(root) / file_name
            target_file_path = target_dir_current / file_name

            if not args.force_overwrite and target_file_path.exists():
                user_input = input(
                    f"File '{target_file_path}' already exists. Overwrite? (y/N/all): "
                ).lower()
                if user_input == 'all':
                    args.force_overwrite = True
                elif user_input != 'y':
                    print(f"Skipping '{source_file_path}'.")
                    continue

            print(f"Processing '{source_file_path}' -> '{target_file_path}'")

            if source_file_path.suffix.lower() != ".md":
                # Non-markdown files are mirrored unchanged.
                try:
                    shutil.copy2(source_file_path, target_file_path)
                    print(f"Copied non-markdown file '{target_file_path}'")
                except Exception as e:
                    print(f"Error copying file '{source_file_path}': {e}")
                continue

            try:
                with open(source_file_path, "r", encoding="utf-8") as f:
                    original_content = f.read()

                if not original_content.strip():
                    print(f"Skipping empty file: '{source_file_path}'")
                    with open(target_file_path, "w", encoding="utf-8") as f:
                        f.write("") # Create empty file in target
                    continue

                translated_content = _translate_markdown(
                    client, args.model_name, original_content, target_language_name, args.source_language, safety_settings
                )

                if translated_content:
                    with open(target_file_path, "w", encoding="utf-8") as f:
                        f.write(translated_content)
                    print(f"Successfully translated and saved '{target_file_path}'")
                else:
                    # Nothing came back at all: keep the target tree complete
                    # by falling back to the untranslated original.
                    shutil.copy2(source_file_path, target_file_path)
                    print(f"Failed to translate '{source_file_path}'. Copied the untranslated original to '{target_file_path}' instead.")
            except Exception as e:
                print(f"Error processing Markdown file '{source_file_path}': {e}")
                try: # Try to copy original on error
                    shutil.copy2(source_file_path, target_file_path)
                    print(f"Copied original '{source_file_path}' to '{target_file_path}' due to processing error.")
                except Exception as copy_e:
                    print(f"Could not even copy original file {source_file_path}: {copy_e}")

    print("Translation process finished.")
    print(f"Please check the '{target_path_root}' directory for translated files.")
    print("Remember to review translations, especially for complex Markdown or nuanced language.")

# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()