Image2Code/extract_code_llm.py at dev · pareshnagore/Image2Code · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
import ollama
import sys
import os
import base64
from PIL import Image
from dotenv import load_dotenv
import json

from core.logger import get_logger

# Load variables from a local .env file (if any) into the process environment
# before anything below reads them; extract_code() looks up OLLAMA_CLOUD_HOST
# and OLLAMA_API_KEY via os.getenv.
load_dotenv()

# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

# Name of the Ollama model passed to client.chat() in extract_code().
# The commented-out assignments are alternative models kept for quick switching.
# MODEL = "qwen3-vl:8b"   # change to 4b if needed
MODEL = "qwen3-vl:235b-cloud"
# MODEL = "gemma3:27b-cloud"
# Default Ollama endpoint, used only when OLLAMA_CLOUD_HOST is not set.
OLLAMA_HOST = "http://localhost:11434"  # Change if Ollama is on different host/port

# Instruction prompt sent verbatim as the user message in extract_code().
# It asks the model to transcribe the code shown in the image and to reply with
# a JSON object describing format, language, file name, line numbering and the
# extracted cells.
# NOTE(review): the example JSON inside the prompt is not itself valid JSON
# (no comma after the "filename" entry, a "#" comment, a trailing comma inside
# "cells"). It also says to omit "filename" when it is not visible, while the
# filename rules say to use "", and it asks for both "filename" and "tab_name"
# although main() only reads "filename". The text is left unchanged here
# because editing it changes what the model is asked to do.
PROMPT = """
You are a precise OCR engine specialized in extracting source code.

Your task:
1. Extract ONLY the source code from the image.
2. Identify the programming language
3. Identify the output file format using language and code structure (e.g. if it's Python code, save as .py; if it's HTML, save as .html, if it's python code but looks like a notebook, save as .ipynb, etc.)
4. Identify file name if it is visible in the image (e.g. from an ACTIVE IDE tab) and use it for output file naming.


Special handling for Jupyter notebooks:
- Extract only code cells
- Ignore outputs
- Ignore execution numbers
- Do NOT merge cells
- Preserve separate cells exactly including markdown cells if they are present
- Ensure valid notebook structure with "cells" array


STRICT RULES FOR FILENAME DETECTION:

1. ONLY use the ACTIVE tab (highlighted / brighter tab)
2. NEVER use background tabs
3. NEVER invent filenames
4. If filename NOT visible, use empty string ""

STRICT RULES FOR FORMAT DETECTION:

Priority order:
1. Filename extension from ACTIVE tab (highest priority), if confidence of active tab detection is high (>80% confidence), else move to next priority
2. Syntax visible in code (lowest priority)

CRITICAL RULES:

1. Output ONLY code
2. Do NOT explain anything
3. Do NOT add comments
4. Do NOT fix or modify code
5. Do NOT hallucinate missing parts
6. Preserve exact indentation
7. Preserve exact symbols
8. Preserve exact variable names
9. Ignore IDE UI elements
10. Preserve line breaks, comments, etc. as they are in the image


OUTPUT FORMAT (STRICT JSON):

{
  "format": "ipynb | python | javascript | html | cpp | json | yaml | text",
  "language": "python | javascript | cpp | html | ...",
  "filename": "optional_filename"  # include if filename is visible in the image, otherwise omit
  "line_numbers_visible": true,
  "start_line": 1,
  "end_line": 50,
  "total_lines": 50,
  "is_notebook": false,
  "cells": [
    {
      "type": "code | markdown",
      "content": "exact code or markdown for this cell",
      "start_line": 1,
      "end_line": 10
    },
  ]
}


ADDITIONAL REQUIRED METADATA FOR MULTI-IMAGE RECONSTRUCTION:

The output JSON MUST also include these fields:

{
  "tab_name": "exact visible ACTIVE tab name or empty string",
  "line_numbers_visible": true or false,
  "start_line": integer if line numbers visible, else null,
  "end_line": integer if line numbers visible, else null,
  "total_lines": integer count of lines visible in this image,
  "is_notebook": true or false
}

RULES:

- start_line and end_line MUST be exact numbers if visible
- NEVER guess line numbers
- If line numbers not visible, use null
- total_lines MUST count only actual code lines extracted
- tab_name MUST match exactly visible active tab text
- These fields are REQUIRED even if null
"""

def extract_code(image_path):
    """Send the image at *image_path* to the configured Ollama model.

    The image file is read once, base64-encoded and attached to a single user
    message whose text is ``PROMPT``. The request asks for JSON output
    (``format="json"``) with fixed sampling options and a fixed seed.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The ``content`` of the model's reply message, as returned by the
        client (expected to be a JSON string; not parsed here).

    Raises:
        OSError: If the image file cannot be opened or read.
        Exception: Whatever the Ollama client raises on a failed request is
            propagated unchanged.
    """
    # Read and encode the image once; this exact payload is what gets sent.
    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")

    logger.debug(f"Image encoded to base64 | size: {len(image_data)} bytes", extra={"image": image_path})

    # Only send an Authorization header when a key is configured; otherwise
    # the header would literally read "Bearer None".
    api_key = os.getenv("OLLAMA_API_KEY")
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

    client = ollama.Client(
        host=os.getenv("OLLAMA_CLOUD_HOST", OLLAMA_HOST),
        headers=headers,
    )

    logger.info(f"Calling LLM model: {MODEL}", extra={"image": image_path})

    response = client.chat(
        model=MODEL,
        messages=[
            {
                "role": "user",
                "content": PROMPT,
                # Pass the already-encoded image instead of the path so the
                # file is not read a second time inside the client.
                "images": [image_data],
            }
        ],
        options={
            "temperature": 0.0,
            "top_p": 0.0,
            "top_k": 1,
            "repeat_penalty": 1.0,
            "seed": 42,
            "num_predict": 4096,
            "num_ctx": 8192,
        },
        format="json",
    )

    logger.debug(f"LLM response received | model: {MODEL}", extra={"image": image_path})

    return response['message']['content']

def _strip_code_fence(text):
    """Return the body of the first ``` fenced section of *text* without its language tag."""
    chunk = text.split("```")[1]
    info, newline, rest = chunk.partition("\n")
    tag = info.strip()
    if newline and (not tag or tag.isalnum()):
        # The opening fence line holds only an (optional) language tag.
        chunk = rest
    elif chunk.lstrip().lower().startswith("json"):
        # Single-line fence such as ```json {...}```: drop just the leading tag.
        chunk = chunk.lstrip()[4:]
    return chunk.strip()


def parse_response(response_text):
    """Turn the model's reply into a dict.

    The reply is first parsed as JSON exactly as received, so a valid JSON
    document whose cell contents happen to contain ``` fences is not mangled.
    Only if that fails is the first fenced section extracted and parsed.

    Args:
        response_text: Raw reply text from the model.

    Returns:
        The decoded JSON object. If no JSON object can be decoded, a fallback
        dict with ``language``, ``format``, ``code`` (the raw text, fence
        removed if there was one) and a single-entry ``cells`` list holding
        the same text, so callers that consume ``cells`` still get the output.
    """
    text = response_text if isinstance(response_text, str) else ""

    candidates = [text]
    if "```" in text:
        # Sometimes the model wraps JSON in ```json ... ```
        candidates.append(_strip_code_fence(text))

    error = None
    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            error = e
            continue
        if isinstance(data, dict):
            return data
        # Valid JSON but not an object (list, number, ...): callers need a dict.
        error = ValueError(f"expected a JSON object, got {type(data).__name__}")

    # Fallback if the model returned raw code instead of JSON.
    logger.warning(f"JSON parsing failed, using fallback | error: {str(error)}")
    raw_code = candidates[-1]
    return {
        "language": "python",
        "format": "py",
        "code": raw_code,
        "cells": [{"type": "code", "content": raw_code}],
    }

def save_as_ipynb(cells, output_file):
    """Write *cells* to *output_file* as a Jupyter notebook (nbformat 4.5).

    Args:
        cells: Iterable of dicts with a ``content`` string and an optional
            ``type`` of ``"code"`` or ``"markdown"``. Any other or missing
            type is written as a code cell; missing or null content becomes
            an empty cell.
        output_file: Path of the ``.ipynb`` file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    notebook = {
        "cells": [],
        "metadata": {},
        "nbformat": 4,
        "nbformat_minor": 5
    }
    for index, cell in enumerate(cells):
        source = (cell.get("content") or "").splitlines(keepends=True)
        kind = str(cell.get("type") or "code").strip().lower()
        # nbformat 4.5 expects every cell to carry a unique id.
        cell_id = f"cell-{index}"
        if kind == "markdown":
            # Markdown cells have no execution_count / outputs fields.
            notebook["cells"].append({
                "cell_type": "markdown",
                "id": cell_id,
                "metadata": {},
                "source": source
            })
        else:
            notebook["cells"].append({
                "cell_type": "code",
                "execution_count": None,
                "id": cell_id,
                "metadata": {},
                "outputs": [],
                "source": source
            })
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(notebook, f, indent=2)

# def save_as_ipynb(code, output_file):
#     notebook = {
#         "cells": [
#             {
#                 "cell_type": "code",
#                 "execution_count": None,
#                 "metadata": {},
#                 "outputs": [],
#                 "source": code.splitlines(keepends=True)
#             }
#         ],
#         "metadata": {},
#         "nbformat": 4,
#         "nbformat_minor": 5
#     }
#     with open(output_file, "w", encoding="utf-8") as f:
#         json.dump(notebook, f, indent=2)

def save_output(code, output_file):
    """Write the text *code* to *output_file* as UTF-8, replacing any existing file."""
    handle = open(output_file, "w", encoding="utf-8")
    with handle:
        handle.write(code)

def save_as_code(cells, output_file):
    """Write the contents of *cells* to *output_file*, one newline between cells.

    Args:
        cells: Iterable of dicts with a ``content`` string. A cell whose
            content is missing or null contributes an empty string instead of
            aborting the write part-way through.
        output_file: Path of the text file to create or overwrite (UTF-8).

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Collect everything first so a malformed cell cannot leave a half-written file.
    parts = [cell.get("content") or "" for cell in cells]
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(parts))

# Maps the "format" value reported by the model to a file extension.
_FORMAT_EXTENSIONS = {
    "ipynb": "ipynb",
    "python": "py",
    "py": "py",
    "javascript": "js",
    "js": "js",
    "typescript": "ts",
    "html": "html",
    "css": "css",
    "cpp": "cpp",
    "c++": "cpp",
    "c": "c",
    "java": "java",
    "json": "json",
    "yaml": "yaml",
    "yml": "yaml",
    "markdown": "md",
    "bash": "sh",
    "shell": "sh",
    "text": "txt",
}


def _output_filename(parsed, fallback_stem, format_name):
    """Choose a directory-free output file name from the parsed model reply."""
    visible_name = parsed.get("filename")
    if isinstance(visible_name, str):
        # The name comes from the model, so keep only its final path component
        # to stop values like "../../x" from escaping the output directory.
        safe_name = os.path.basename(visible_name.strip().replace("\\", "/")).lower()
        if safe_name not in ("", ".", ".."):
            return safe_name

    extension = _FORMAT_EXTENSIONS.get(format_name)
    if extension is None:
        # Unknown format: fall back to its alphanumeric characters, or "txt".
        extension = "".join(ch for ch in format_name if ch.isalnum()) or "txt"
    return f"{fallback_stem.lower()}.{extension}"


def main():
    """Command-line entry point: extract code from one image into ``outputs/``.

    Reads the image path from ``sys.argv[1]``, sends the image to the model,
    parses the reply and writes the result either as a notebook (when the
    reported format is ``ipynb``) or as a plain source file. Problems are
    logged and the function returns without raising.
    """
    if len(sys.argv) < 2:
        logger.error("No image file provided")
        print("Usage:")
        print("python extract_code_llm.py image.jpg")
        return

    image_path = sys.argv[1]

    if not os.path.exists(image_path):
        logger.error(f"Image file not found | path: {image_path}")
        return

    name = os.path.splitext(os.path.basename(image_path))[0]

    logger.info(f"Starting extraction | image: {image_path}", extra={"image": image_path})

    response = extract_code(image_path)

    logger.debug("Raw response from model received", extra={"response_length": len(response or "")})

    parsed = parse_response(response)
    if not isinstance(parsed, dict):
        logger.error(f"Unexpected response structure | type: {type(parsed).__name__}")
        return

    # The model may emit explicit nulls, so use "or" rather than .get defaults.
    format_ext = str(parsed.get("format") or "python").strip().lower()
    language = parsed.get("language") or "python"

    cells = parsed.get("cells")
    cells = [cell for cell in cells if isinstance(cell, dict)] if isinstance(cells, list) else []
    if not cells and parsed.get("code"):
        # parse_response's raw-text fallback carries the text under "code".
        cells = [{"type": "code", "content": parsed["code"]}]
    if not cells:
        logger.warning("No code cells found in model response")

    logger.debug(f"Response parsed | format: {format_ext} | language: {language} | cells: {len(cells)}")

    filename = _output_filename(parsed, name, format_ext)
    output_dir = "outputs"
    output_file = f"{output_dir}/{filename}"

    logger.info(f"Saving code | format: {format_ext} | language: {language} | output: {output_file}")

    try:
        # The output directory is not guaranteed to exist on a fresh checkout.
        os.makedirs(output_dir, exist_ok=True)

        if format_ext == "ipynb":
            save_as_ipynb(cells, output_file)
        else:
            save_as_code(cells, output_file)

        logger.info(f"Code saved successfully | file: {output_file}")
    except Exception as e:
        logger.error(f"Failed to save code | file: {output_file} | error: {str(e)}")

if __name__ == "__main__":
    main()