This commit is contained in:
ilia 2025-07-11 09:32:01 -08:00
parent 70e3b66c95
commit 2265649669
6 changed files with 1163 additions and 674 deletions

3
.gitignore vendored
View File

@ -1 +1,2 @@
.history
.history
*.png

13
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,13 @@
{
"version": "0.2.0",
"configurations": [
{"name":"Python Debugger: Current File","type":"debugpy","request":"launch","program":"${file}","console":"integratedTerminal"},
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}

433
clear Normal file
View File

@ -0,0 +1,433 @@
{
"pdf_file_processed": "test2.pdf",
"pdf_full_path": "/mnt/c/Users/admin/Downloads/test2.pdf",
"pages_processed_spec": "5",
"extraction_timestamp": "2025-06-03 08:55:13 EDT",
"total_highlights_extracted": 20,
"settings_used": {
"clean_edges": true,
"show_diff_percentage": true
},
"highlights_data": [
{
"page": 5,
"highlight_id_on_page": 1,
"text": "or prejudice in",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 53.75,
"x_position": 60.0,
"rect_details": [
60.0,
53.75,
116.0,
63.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 2,
"text": "unin",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 77.75,
"x_position": 164.0,
"rect_details": [
164.0,
77.75,
169.0,
87.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 3,
"text": "uninformed about how language can stand as a barrier to jus-",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 77.75,
"x_position": 164.0,
"rect_details": [
164.0,
77.75,
405.0,
87.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 4,
"text": "tice or equal opportunity.",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 89.75,
"x_position": 60.0,
"rect_details": [
60.0,
89.75,
158.0,
99.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 5,
"text": "linguistics,",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 137.75,
"x_position": 188.0,
"rect_details": [
188.0,
137.75,
226.0,
147.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 6,
"text": "needs to make applied contributions to the understanding and solution of racial discrimination, criminal injustice, and other social problems.",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 149.75,
"x_position": 60.0,
"rect_details": [
60.0,
149.75,
408.0,
171.75
],
"num_segments": 2
},
{
"page": 5,
"highlight_id_on_page": 7,
"text": "first",
"color": "blue",
"raw_rgb_values": [
0.5607839822769165,
0.8705880045890808,
0.9764710068702698
],
"type": "highlight",
"y_position": 173.75,
"x_position": 182.0,
"rect_details": [
182.0,
173.75,
198.0,
183.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 8,
"text": "at interpreters are not generally provided for dialects of a language, only for foreign languages",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 197.75,
"x_position": 60.0,
"rect_details": [
60.0,
197.75,
408.0,
219.75
],
"num_segments": 2
},
{
"page": 5,
"highlight_id_on_page": 9,
"text": "(§2),",
"color": "blue",
"raw_rgb_values": [
0.5607839822769165,
0.8705880045890808,
0.9764710068702698
],
"type": "highlight",
"y_position": 197.75,
"x_position": 182.0,
"rect_details": [
182.0,
197.75,
201.0,
207.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 10,
"text": "§3",
"color": "blue",
"raw_rgb_values": [
0.5607839822769165,
0.8705880045890808,
0.9764710068702698
],
"type": "highlight",
"y_position": 209.75,
"x_position": 398.0,
"rect_details": [
398.0,
209.75,
408.0,
219.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 11,
"text": "specific case of Rachel Jeantels dialect, a",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 221.75,
"x_position": 84.0,
"rect_details": [
84.0,
221.75,
241.0,
231.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 12,
"text": "whether the credibility and intelligibility problems that led jurors to disregard Jeantels testimony were due",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 269.75,
"x_position": 60.0,
"rect_details": [
60.0,
269.75,
408.0,
291.75
],
"num_segments": 2
},
{
"page": 5,
"highlight_id_on_page": 13,
"text": "§4 we",
"color": "blue",
"raw_rgb_values": [
0.5607839822769165,
0.8705880045890808,
0.9764710068702698
],
"type": "highlight",
"y_position": 269.75,
"x_position": 237.0,
"rect_details": [
237.0,
269.75,
257.0,
279.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 14,
"text": "dialect and insti-",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 317.75,
"x_position": 342.0,
"rect_details": [
342.0,
317.75,
402.0,
327.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 15,
"text": "tutionalized racism negatively impact AAVE and other vernacular speakers i",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 329.75,
"x_position": 60.0,
"rect_details": [
60.0,
329.75,
367.0,
339.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 16,
"text": "(§5).",
"color": "blue",
"raw_rgb_values": [
0.5607839822769165,
0.8705880045890808,
0.9764710068702698
],
"type": "highlight",
"y_position": 341.75,
"x_position": 342.0,
"rect_details": [
342.0,
341.75,
355.0,
351.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 17,
"text": "summarize our conclusions a",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 353.75,
"x_position": 60.0,
"rect_details": [
60.0,
353.75,
170.0,
363.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 18,
"text": "(§6).",
"color": "blue",
"raw_rgb_values": [
0.5607839822769165,
0.8705880045890808,
0.9764710068702698
],
"type": "highlight",
"y_position": 365.75,
"x_position": 220.0,
"rect_details": [
220.0,
365.75,
236.0,
375.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 19,
"text": "at nonstandard or vernacular dialects",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 407.75,
"x_position": 206.0,
"rect_details": [
206.0,
407.75,
340.0,
417.75
],
"num_segments": 1
},
{
"page": 5,
"highlight_id_on_page": 20,
"text": "spoken most frequently and fluently by ethnic minorities and/or by less educated, working-class,orpoorpeopleworldwide.1",
"color": "yellow",
"raw_rgb_values": [
1.0,
0.9411770105361938,
0.4000000059604645
],
"type": "highlight",
"y_position": 431.75,
"x_position": 60.0,
"rect_details": [
60.0,
431.75,
408.0,
453.75
],
"num_segments": 2
}
]
}

1386
main.py

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,2 @@
pdfplumber==0.10.3
colorama==0.4.6
pandas==2.0.3
PyMuPDF==1.23.1

BIN
test/test2.pdf Normal file

Binary file not shown.