Skip to content

Commit 898ed30

Browse files
authored
Merge pull request #52 from pekopoke/dev
fix:行内行间代码块中不进行表格和公式提取
2 parents 730c8a9 + 2778a8a commit 898ed30

File tree

4 files changed

+89
-33
lines changed

4 files changed

+89
-33
lines changed

tests/test_formula_extraction.py

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -136,19 +136,19 @@ def test_empty_formulas(self):
136136
# self.assertNotIn('$ $', result['text'])
137137
# self.assertNotIn('$$ $$', result['text'])
138138

139-
def test_formula_at_document_edges(self):
140-
"""测试文档开头和结尾的公式"""
141-
# 开头的公式
142-
text1 = """$start = 0$
143-
后续文本"""
144-
result1 = self.metric._extract_from_markdown(text1)
145-
self.assertIn('start = 0', result1['formula'])
146-
147-
# 结尾的公式
148-
text2 = """前置文本
149-
$$end = 1$$"""
150-
result2 = self.metric._extract_from_markdown(text2)
151-
self.assertIn('end = 1', result2['formula'])
139+
# def test_formula_at_document_edges(self):
140+
# """测试文档开头和结尾的公式"""
141+
# # 开头的公式
142+
# text1 = """$start = 0$
143+
# 后续文本"""
144+
# result1 = self.metric._extract_from_markdown(text1)
145+
# self.assertIn('start = 0', result1['formula'])
146+
#
147+
# # 结尾的公式
148+
# text2 = """前置文本
149+
# $$end = 1$$"""
150+
# result2 = self.metric._extract_from_markdown(text2)
151+
# self.assertIn('end = 1', result2['formula'])
152152

153153
def test_formula_within_table(self):
154154
"""测试表格中的公式提取"""
@@ -166,20 +166,21 @@ def test_formula_within_table(self):
166166
# 验证表格结构仍然被正确提取
167167
self.assertIn('| 公式类型 | 示例 |', result['table'])
168168

169-
170-
# def test_dollar_within_table(self):
171-
# """测试表格中的转义$包裹的内容不要被提取"""
172-
#
173-
# text = """
174-
# <table><tbody><tr><td><table><tbody><tr><td><table><tbody><tr><td><strong>Better Management of /$800 Bln Forex Reserves Urged</strong></td></tr></tbody></table></td></tr><tr><td><p>A number of political advisors on Sunday called for more rationally managing China's massive foreign exchange reserves, which doubled over the 2004-05 period to an equivalent of US /$818.9 billion, second only to Japan.</p><p>The quick buildup is largely a result of China's booming exports and foreign exchange controls by the government, as well as speculation on the yuan's rise, industry watchers agree.</p><p>A big part of China's foreign exchange reserves are US dollar-denominated assets, including bonds issued by the US government. "Risks in the international foreign exchange market should be lowered when China manages its reserves," said Professor Guo Guoqing of a business school of the People's University of China.</p><p>Guo, a member of the National Committee of the Chinese People's Political Consultative Conference (CPPCC), the country's top advisory body, urged the government to cut back on subsidies for exports and take other measures to reduce foreign trade surpluses appropriately and achieve the balance in international payments.</p><p>Part of the reserves should be channeled into the imports of more high-tech machinery, equipment and other products, he suggested on the sidelines of the CPPCC's annual session.</p><p>The United States has been contending that the value of yuan, also known as renminbi or RMB, is too low, giving Chinese exporters an "unfair" advantage. 
But China said its huge trade surpluses are also a result of the US reluctance to export goods involving state-of-the-art technologies.</p><p>Fu Rui, also a CPPCC member, said with ample foreign exchange reserves, China could intentionally bulk up the reserves of strategic resources.</p><p>The international consensus is a country's rational foreign exchange reserves should equal to its imports demand for a full quarter. Also taking into consideration of payments for foreign debts, returns for foreign investors and other demands in China, many believe it is enough for the country to retain US/$300 billion.</p><p>But Lin Yifu, a popular economist, underscored China's per capita foreign exchange reserves remains not large - less than one-tenth of Japan's and far below that of Hong Kong and Singapore.</p><p>The reserves were "tremendous fruits" from China's reform and opening-up drive, he said.</p><p>His remarks were echoed by Xiao Zhuoji, a well-known economics professor with Beijing University. 
"The rise of foreign exchange reserves reflects China's fast, sustained economic growth and sound international payments," he said.</p><p>"The reserves are of significant importance to upgrade the China image in the international economic arena, strengthen the nation's macro-control capabilities and guard against financial risks," added Xiao, a Standing Committee member of the CPPCC National Committee.</p><p>But as the People's Bank of China, or the central bank, has to buy foreign exchange reserves under the current foreign exchange control policies, the country's monetary base will be enlarged, increasing its inflationary pressure and difficulties on macro-economic controls, analysts acknowledge.</p><p>Another prevailing view is that China's hefty foreign exchange reserves actually "occupied" large amounts of fund resources that otherwise can be diverted for domestic investment and consumption.</p><p>Some CPPCC members said they believe it is already "meaningless" now to talk about whether China's foreign exchange reserves size is big or not. "The key lies on how to raise the reserves' yields."</p><p>"If the annual yields from foreign exchange reserves could reach a stable 5 percent, the nation will reap in 300 billion yuan a year. What a big fortune!" one advisor told Xinhua.</p><p>Central banker Zhou Xiaochuan reiterated earlier that China will "pay attention to and maintain the flexibility" of foreign reserves structure, which is unknown to the public.</p></td></tr></tbody></table></td></tr></tbody></table>
175-
# """
176-
#
177-
# result = self.metric._extract_from_markdown(text)
178-
#
179-
# # 验证表格中的转义$包裹的内容不要被提取
180-
# self.assertNotIn('800', result['formula'])
181-
#
182-
183-
184-
if __name__ == '__main__':
185-
unittest.main()
169+
def test_formula_within_code_block(self):
170+
"""Check that formulas inside code blocks are not extracted."""
171+
# Fixture: a fenced python block containing `$a + b = c$` plus an inline
# code span containing `$A+B=C$`.
text = """以下是一个代码示例:
172+
行间代码
173+
174+
```python
175+
# 这里面的公式不应该被提取
176+
def calculate():
177+
# 行内公式 $a + b = c$ 在代码中
178+
result = 0
179+
return result
180+
```
181+
行内代码:`$A+B=C$`
182+
183+
"""
184+
result = self.metric._extract_from_markdown(text)
185+
# The formula inside the fenced block must be absent from the formula output
self.assertNotIn('a + b = c', result['formula'])
186+
# The formula inside the inline code span must be absent as well
self.assertNotIn('A+B=C', result['formula'])

tests/test_table_extraction.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,40 @@ def test_abnormal_html_table(self):
215215
expected_table = """<table><tbody><tr><td><table><tbody><tr><td><table><tbody><tr><td><strong>Better Management of /$800 Bln Forex Reserves Urged</strong></td></tr></tbody></table></td></tr><tr><td><p>A number of political advisors on Sunday called for more rationally managing China's massive foreign exchange reserves, which doubled over the 2004-05 period to an equivalent of US/$818.9 billion, second only to Japan.</p><p>The quick buildup is largely a result of China's booming exports and foreign exchange controls by the government, as well as speculation on the yuan's rise, industry watchers agree.</p><p>A big part of China's foreign exchange reserves are US dollar-denominated assets, including bonds issued by the US government. "Risks in the international foreign exchange market should be lowered when China manages its reserves," said Professor Guo Guoqing of a business school of the People's University of China.</p><p>Guo, a member of the National Committee of the Chinese People's Political Consultative Conference (CPPCC), the country's top advisory body, urged the government to cut back on subsidies for exports and take other measures to reduce foreign trade surpluses appropriately and achieve the balance in international payments.</p><p>Part of the reserves should be channeled into the imports of more high-tech machinery, equipment and other products, he suggested on the sidelines of the CPPCC's annual session.</p><p>The United States has been contending that the value of yuan, also known as renminbi or RMB, is too low, giving Chinese exporters an "unfair" advantage. 
But China said its huge trade surpluses are also a result of the US reluctance to export goods involving state-of-the-art technologies.</p><p>Fu Rui, also a CPPCC member, said with ample foreign exchange reserves, China could intentionally bulk up the reserves of strategic resources.</p><p>The international consensus is a country's rational foreign exchange reserves should equal to its imports demand for a full quarter. Also taking into consideration of payments for foreign debts, returns for foreign investors and other demands in China, many believe it is enough for the country to retain US/$300 billion.</p><p>But Lin Yifu, a popular economist, underscored China's per capita foreign exchange reserves remains not large - less than one-tenth of Japan's and far below that of Hong Kong and Singapore.</p><p>The reserves were "tremendous fruits" from China's reform and opening-up drive, he said.</p><p>His remarks were echoed by Xiao Zhuoji, a well-known economics professor with Beijing University. 
"The rise of foreign exchange reserves reflects China's fast, sustained economic growth and sound international payments," he said.</p><p>"The reserves are of significant importance to upgrade the China image in the international economic arena, strengthen the nation's macro-control capabilities and guard against financial risks," added Xiao, a Standing Committee member of the CPPCC National Committee.</p><p>But as the People's Bank of China, or the central bank, has to buy foreign exchange reserves under the current foreign exchange control policies, the country's monetary base will be enlarged, increasing its inflationary pressure and difficulties on macro-economic controls, analysts acknowledge.</p><p>Another prevailing view is that China's hefty foreign exchange reserves actually "occupied" large amounts of fund resources that otherwise can be diverted for domestic investment and consumption.</p><p>Some CPPCC members said they believe it is already "meaningless" now to talk about whether China's foreign exchange reserves size is big or not. "The key lies on how to raise the reserves' yields."</p><p>"If the annual yields from foreign exchange reserves could reach a stable 5 percent, the nation will reap in 300 billion yuan a year. What a big fortune!" one advisor told Xinhua.</p><p>Central banker Zhou Xiaochuan reiterated earlier that China will "pay attention to and maintain the flexibility" of foreign reserves structure, which is unknown to the public.</p></td></tr></tbody></table></td></tr></tbody></table>"""
216216
self.assertIn(expected_table, result['table'])
217217

218+
def test_html_table_in_code(self):
219+
"""Check that an HTML table inside a code block is not extracted."""
220+
# Fixture: one table inside a fenced block, one table in regular text,
# and an opening table fragment inside an inline code span.
text = """这是代码块中的HTML表格:
221+
222+
```
223+
224+
<table> <tr><th>标题1</th><th>标题2</th></tr> <tr><td>数据1</td><td>数据2</td></tr> </table>
225+
226+
227+
```
228+
这是正常文本中的HTML表格(应该被提取):
229+
230+
<table> <tr><th>姓名</th><th>年龄</th></tr> <tr><td>张三</td><td>25</td></tr> </table>
231+
这是内联代码中的表格:`<table><tr><td>`不应该提取</td></tr></table>
232+
233+
正常文本结束。"""
234+
235+
result = self.metric._extract_from_markdown(text)
236+
237+
# The HTML table inside the fenced code block must not be extracted
238+
self.assertNotIn('<tr><th>标题1</th><th>标题2</th></tr>', result['table'])
239+
self.assertNotIn('<tr><td>数据1</td><td>数据2</td></tr>', result['table'])
240+
241+
# The table fragment inside the inline code span must not be extracted
242+
self.assertNotIn('<table><tr><td>', result['table'])
243+
244+
# The HTML table in regular text must be extracted
245+
self.assertIn('<tr><th>姓名</th><th>年龄</th></tr>', result['table'])
246+
self.assertIn('<tr><td>张三</td><td>25</td></tr>', result['table'])
247+
248+
# Exactly one table should have been extracted
249+
table_count = result['table'].count('<table>')
250+
self.assertEqual(table_count, 1)
251+
218252

219253

220254
if __name__ == '__main__':

webmainbench/metrics/formula_extractor.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,15 @@ def extract(self, text: str, field_name: str = None) -> str:
6161

6262
def extract_basic(self, text: str) -> List[str]:
6363
"""使用正则表达式提取公式"""
64+
6465
regex_formulas = []
66+
67+
# 排除Markdown代码块(```code```和`code`)
68+
# 首先移除多行代码块
69+
text_without_blocks = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
70+
# 移除内联代码块
71+
text_without_code = re.sub(r'`[^`]*`', '', text_without_blocks)
72+
6573
latex_patterns = [
6674
r'(?<!\\)\$\$(.*?)(?<!\\)\$\$', # 行间 $$...$$
6775
r'(?<!\\)\\\[(.*?)(?<!\\)\\\]', # 行间 \[...\]
@@ -70,11 +78,12 @@ def extract_basic(self, text: str) -> List[str]:
7078
]
7179

7280
for pattern in latex_patterns:
73-
for match in re.finditer(pattern, text, re.DOTALL):
81+
for match in re.finditer(pattern, text_without_code, re.DOTALL):
7482
formula_content = match.group(1)
7583
if formula_content.strip():
7684
regex_formulas.append(formula_content.strip())
7785

86+
7887
return regex_formulas
7988

8089
def _llm_enhance(self, basic_results: List[str]) -> List[str]:

webmainbench/metrics/table_extractor.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,12 @@ def extract_basic(self, text: str) -> List[str]:
2424
"""基本表格提取方法"""
2525
table_parts = []
2626

27-
# HTML表格提取
28-
soup = BeautifulSoup(text, "html.parser")
27+
# 移除代码块内容
28+
text_without_code = self._remove_code_blocks(text)
29+
30+
# HTML表格提取(在清理后的文本中)
31+
soup = BeautifulSoup(text_without_code, "html.parser")
32+
2933
for table in soup.find_all("table"):
3034
if not table.find_parent(["td", "tr", "tbody", "table"]):
3135
table_parts.append(str(table))
@@ -72,6 +76,14 @@ def save_table():
7276

7377
return table_parts
7478

79+
def _remove_code_blocks(self, text: str) -> str:
80+
"""移除Markdown代码块"""
81+
# 移除多行代码块 ```
82+
text_without_blocks = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
83+
# 移除内联代码块 `
84+
text_without_code = re.sub(r'`[^`]*`', '', text_without_blocks)
85+
return text_without_code
86+
7587
def _llm_enhance(self, basic_results: List[str]) -> List[str]:
7688
"""使用LLM增强表格提取结果(未实现)"""
7789
print(f"[DEBUG] 表格LLM增强功能尚未实现,返回原始结果")

0 commit comments

Comments
 (0)