From 1078d0d2033e011083df27a1287f331ae44ddbd1 Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Sat, 11 Oct 2025 03:00:17 +0000 Subject: [PATCH] Optimize embed_community_reports The optimization replaces pandas' `.apply()` with lambda function with a direct list comprehension approach, yielding a **27% speedup**. **Key Changes:** - **Eliminated pandas `.apply()` overhead**: The original code used `reports_df.loc[:, source_col].apply(lambda x: embedder.embed(x))` which has significant pandas overhead for element-wise operations - **Direct list comprehension**: Replaced with `src = reports_df[source_col].to_list()` followed by `embeddings = [embedder.embed(x) for x in src]` - **Reduced pandas Series operations**: Converted to native Python list processing before assigning back to the DataFrame **Why This is Faster:** 1. **Pandas `.apply()` overhead**: Each `.apply()` call has internal pandas machinery that processes each element through the pandas Series infrastructure 2. **Lambda function overhead**: Creating and calling lambda functions for each row adds computational cost 3. **List comprehension efficiency**: Native Python list comprehensions are highly optimized in CPython and avoid pandas' internal overhead **Performance Characteristics:** - **Best for moderate to large datasets**: Shows 18-40% improvements across test cases with varying DataFrame sizes - **Consistent gains**: Even small DataFrames (single row) see 37-40% speedup - **Scales well**: Large DataFrames (1000 rows) maintain 18-20% improvements - **Edge cases preserved**: Handles None values, mixed types, and empty DataFrames correctly while maintaining the performance benefit The line profiler shows the bottleneck shifted from a single expensive `.apply()` operation (93.4% of time) to three more balanced operations: list conversion (9.8%), embedding computation (33.5%), and DataFrame assignment (49.3%). --- graphrag/query/indexer_adapters.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/graphrag/query/indexer_adapters.py b/graphrag/query/indexer_adapters.py index 0c6e54a8af..877d2bf780 100644 --- a/graphrag/query/indexer_adapters.py +++ b/graphrag/query/indexer_adapters.py @@ -228,9 +228,12 @@ def embed_community_reports( raise ValueError(error_msg) if embedding_col not in reports_df.columns: - reports_df[embedding_col] = reports_df.loc[:, source_col].apply( - lambda x: embedder.embed(x) - ) + # Avoid using .apply with a lambda for improved performance. + # Use a list comprehension, which is faster for element-wise operations in pandas. + src = reports_df[source_col].to_list() + # No change in behavior, ensures a list of same length as DataFrame + embeddings = [embedder.embed(x) for x in src] + reports_df[embedding_col] = embeddings return reports_df