Siyu Wang
committed on
Commit
·
2b2a944
1
Parent(s):
1f172f7
Moved from https://huggingface.co/spaces/siyuwang541/ToDoAgent for the hackathon
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .DS_Store +0 -0
- LICENSE +201 -0
- LLM/.DS_Store +0 -0
- LLM/Database/FalseNegative_samples.json +1 -0
- LLM/Database/FalsePositive_samples.json +0 -0
- LLM/Database/Manual_filtering_samples.json +190 -0
- LLM/Database/Messages.json +0 -0
- LLM/Database/README.md +8 -0
- LLM/Database/TrueNegative_samples.json +197 -0
- LLM/Database/TruePositive_samples.json +270 -0
- LLM/Database/classify_samples.py +222 -0
- LLM/Database/negative_samples.json +1 -0
- LLM/Database/positive_samples.json +0 -0
- LLM/Database/update_Messagejson.py +63 -0
- LLM/Notify/NotifyReadme.md +50 -0
- LLM/Notify/__pycache__/dataBaseConnecter.cpython-312.pyc +0 -0
- LLM/Notify/compareDb2txt.py +130 -0
- LLM/Notify/dataBaseConnecter.py +128 -0
- LLM/Notify/db2txt.py +88 -0
- LLM/Notify/notifyMain.py +88 -0
- LLM/Notify/usrSpareTime.py +148 -0
- LLM/filter_message/README.MD +18 -0
- LLM/filter_message/__pycache__/libs.cpython-312.pyc +0 -0
- LLM/filter_message/libs.py +217 -0
- LLM/filter_message/main.py +124 -0
- LLM/filter_message/prompt.md +96 -0
- LLM/filter_message/prompt.txt +90 -0
- LLM/filter_message/requirements.txt +4 -0
- LLM/orchestrator.py +95 -0
- LLM/requirements.txt +12 -0
- LLM/todogen_LLM/FalsePositive_few_shot.txt +31 -0
- LLM/todogen_LLM/TruePositive_few_shot.txt +59 -0
- LLM/todogen_LLM/__pycache__/compare_data.cpython-312.pyc +0 -0
- LLM/todogen_LLM/__pycache__/config_loader.cpython-312.pyc +0 -0
- LLM/todogen_LLM/__pycache__/database_of_messages.cpython-312.pyc +0 -0
- LLM/todogen_LLM/__pycache__/export_todolist.cpython-312.pyc +0 -0
- LLM/todogen_LLM/__pycache__/filter_message_list.cpython-312.pyc +0 -0
- LLM/todogen_LLM/__pycache__/filter_useful_data_to_dict.cpython-312.pyc +0 -0
- LLM/todogen_LLM/__pycache__/logger_config.cpython-312.pyc +0 -0
- LLM/todogen_LLM/__pycache__/path_validator.cpython-312.pyc +0 -0
- LLM/todogen_LLM/__pycache__/receiving_useful_messages.cpython-312.pyc +0 -0
- LLM/todogen_LLM/__pycache__/todogen_llm.cpython-312.pyc +0 -0
- LLM/todogen_LLM/compare_data.py +209 -0
- LLM/todogen_LLM/config_loader.py +34 -0
- LLM/todogen_LLM/database_of_messages.py +221 -0
- LLM/todogen_LLM/export_todolist.py +59 -0
- LLM/todogen_LLM/filter_message_list.py +49 -0
- LLM/todogen_LLM/filter_useful_data_to_dict.py +163 -0
- LLM/todogen_LLM/jiaoben.py +318 -0
- LLM/todogen_LLM/logger_config.py +69 -0
.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
LICENSE
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright [yyyy] [name of copyright owner]
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
LLM/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
LLM/Database/FalseNegative_samples.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[]
|
LLM/Database/FalsePositive_samples.json
ADDED
File without changes
|
LLM/Database/Manual_filtering_samples.json
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"sender": "ASAP Sample",
|
4 |
+
"content": "王斯煜[表情]Vince 黑客松nv: @AlisaGG @LJK86 明天中午再对 WANG Siyu邀请你加入飞书视频会议\n会议主题:FilterLLM ToDoGenLLM PR Merge讨论\n会议时间:3月31日 (今天) 12:00 - 12:30 (GMT+8)\n会议 ID:656 445 907\n会议链接:https://...",
|
5 |
+
"app_name": "com.tencent.mm",
|
6 |
+
"message_id": 331001504,
|
7 |
+
"user_id": 67619984,
|
8 |
+
"date": "2025-03-31T00:15:21",
|
9 |
+
"classification": "TruePositive"
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"sender": "ASAP Sample",
|
13 |
+
"content": "[6条]王斯煜[表情]Vince 黑客松nv: Your Mint 3-Month, Unlimited plan expires in 2 days. Log into your account at my.mintmobile.com or via our app to make a payment and keep your Mint se...",
|
14 |
+
"app_name": "com.tencent.mm",
|
15 |
+
"message_id": 331002905,
|
16 |
+
"user_id": 67619984,
|
17 |
+
"date": "2025-03-31T00:29:45",
|
18 |
+
"classification": "TruePositive"
|
19 |
+
},
|
20 |
+
{
|
21 |
+
"sender": "ASAP Sample",
|
22 |
+
"content": "[4条]斯煜[表情]Vince: 【美团月付】您2月账单805.57元待还,最后还款日为本月22号,查账或立即还款点 dpurl.cn/HXFUxEba",
|
23 |
+
"app_name": "com.tencent.mm",
|
24 |
+
"message_id": 331002907,
|
25 |
+
"user_id": 74797059,
|
26 |
+
"date": "2025-03-31T00:29:45",
|
27 |
+
"classification": "TruePositive"
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"sender": "ASAP Sample",
|
31 |
+
"content": "[4条]斯煜[表情]Vince: 【中通快递】73547529665397放在丰巢的包裹请及时取件,如有问题致电15900077340、020-22504077。",
|
32 |
+
"app_name": "com.tencent.mm",
|
33 |
+
"message_id": 331002908,
|
34 |
+
"user_id": 74797059,
|
35 |
+
"date": "2025-03-31T00:29:45",
|
36 |
+
"classification": "TruePositive"
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"sender": "ASAP Sample",
|
40 |
+
"content": "[6条]斯煜[表情]Vince: 尊敬的*斯煜,您在我行办理的1笔个人贷款需于2024年12月08日17:00前还款,当期还款金额本息合计999999.84元,请您留意尾号0455的账户可用余额是否充足,避免因贷款逾期影响个人征信。具体贷款信息可通过工行网上银行、手机银行或致电贷款经办行查询。【工商银行】",
|
41 |
+
"app_name": "com.tencent.mm",
|
42 |
+
"message_id": 331002909,
|
43 |
+
"user_id": 74797059,
|
44 |
+
"date": "2025-03-31T00:29:45",
|
45 |
+
"classification": "TruePositive"
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"sender": "ASAP Sample",
|
49 |
+
"content": "[6条]斯煜[表情]Vince: Your Mint 3-Month, Unlimited plan expires in 2 days. Log into your account at my.mintmobile.com or via our app to make a payment and keep your Mint se...",
|
50 |
+
"app_name": "com.tencent.mm",
|
51 |
+
"message_id": 331002910,
|
52 |
+
"user_id": 74797059,
|
53 |
+
"date": "2025-03-31T00:29:45",
|
54 |
+
"classification": "TruePositive"
|
55 |
+
},
|
56 |
+
{
|
57 |
+
"sender": "ASAP Sample",
|
58 |
+
"content": "王斯煜[表情]Vince 黑客松nv: WANG Siyu邀请你加入飞书视频会议\n会议主题:FilterLLM ToDoGenLLM PR Merge讨论\n会议时间:3月31日 (今天) 12:00 - 12:30 (GMT+8)\n会议 ID:656 445 907\n会议链接:https://vc.feishu.cn/j/65644590...",
|
59 |
+
"app_name": "com.tencent.mm",
|
60 |
+
"message_id": 331115637,
|
61 |
+
"user_id": 67619984,
|
62 |
+
"date": "2025-03-31T11:56:36",
|
63 |
+
"classification": "TruePositive"
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"sender": "ASAP Sample",
|
67 |
+
"content": "AlisaGG陈G老师1010: 【驿收发】您的邮政包裹已到凯丰花园2栋驿站,请23:00前凭5-5-6530来取,详询18320926368",
|
68 |
+
"app_name": "com.tencent.mm",
|
69 |
+
"message_id": 331133944,
|
70 |
+
"user_id": 67619984,
|
71 |
+
"date": "2025-03-31T13:39:49",
|
72 |
+
"classification": "TruePositive"
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"sender": "ASAP Sample",
|
76 |
+
"content": "[2条]AlisaGG: 【小象超市】您好,我是小象超市骑手,【美团智能外卖柜】您的外卖已送至新一代产业园2栋4号柜(面向马路侧),格口号:27,取件码:9310,存柜超过12小时将被清理,请及时取件。",
|
77 |
+
"app_name": "com.tencent.mm",
|
78 |
+
"message_id": 331134011,
|
79 |
+
"user_id": 74797059,
|
80 |
+
"date": "2025-03-31T13:40:54"
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"sender": "ASAP Sample",
|
84 |
+
"content": "[2条]AlisaGG陈G老师1010: 【小象超市】您好,我是小象超市骑手,【美团智能外卖柜】您的外卖已送至新一代产业园2栋4号柜(面向马路侧),格口号:27,取件码:9310,存柜超过12小时将被清理,请及时取件。",
|
85 |
+
"app_name": "com.tencent.mm",
|
86 |
+
"message_id": 331134045,
|
87 |
+
"user_id": 67619984,
|
88 |
+
"date": "2025-03-31T13:40:54",
|
89 |
+
"classification": "TruePositive"
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"sender": "ASAP Sample",
|
93 |
+
"content": "[3条]AlisaGG陈G老师1010: 取件通知\n取件码: 11724837\n运单号: 464285154986072\n取件地址: 深圳新一代产业园P2出入口内侧4号丰巢柜\n配送人员: 18124519013\n计费规则: 查看详情",
|
94 |
+
"app_name": "com.tencent.mm",
|
95 |
+
"message_id": 331134346,
|
96 |
+
"user_id": 67619984,
|
97 |
+
"date": "2025-03-31T13:43:17",
|
98 |
+
"classification": "TruePositive"
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"sender": "ASAP Sample",
|
102 |
+
"content": "[4条]AlisaGG陈G老师1010: 取件再次提醒\n取件码:: 27696201\n配送公司:: 申通快递\n运单号:: 777293635831671\n配送员手机:: 13392809673\n取件地址:: 深圳新一代产业园P2出入口内侧4号丰巢柜",
|
103 |
+
"app_name": "com.tencent.mm",
|
104 |
+
"message_id": 331134447,
|
105 |
+
"user_id": 67619984,
|
106 |
+
"date": "2025-03-31T13:44:02",
|
107 |
+
"classification": "TruePositive"
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"sender": "ASAP Sample",
|
111 |
+
"content": "[5条]李JK老师-1228: 【菜鸟驿站】请凭140-3-1019到菜鸟驿站取件,查询详情u.cainiao.com/53h4bSr7zrh",
|
112 |
+
"app_name": "com.tencent.mm",
|
113 |
+
"message_id": 331135751,
|
114 |
+
"user_id": 67619984,
|
115 |
+
"date": "2025-03-31T13:57:28",
|
116 |
+
"classification": "TruePositive"
|
117 |
+
},
|
118 |
+
{
|
119 |
+
"sender": "ASAP Sample",
|
120 |
+
"content": "刻刻: 【停机前提醒】尊敬的移动客户,您好!您的账户余额不足被限制使用。现提醒您需充值缴费至少32.49元,以确保您继续享受畅通的通信服务。诚邀您一键办理自动充服务,自动充值缴费更轻松:https://dx.10086.cn/7WyRLA。 心级服务、让爱连接【中国移动】",
|
121 |
+
"app_name": "com.tencent.mm",
|
122 |
+
"message_id": 331162269,
|
123 |
+
"user_id": 67619984,
|
124 |
+
"date": "2025-03-31T16:22:21",
|
125 |
+
"classification": "TruePositive"
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"sender": "ASAP Sample",
|
129 |
+
"content": "[3条]LJK86: 您的号卡已在配送途中,物流单号SF3147624215612,点击:http://t.hn.189.cn/EZv6bazD,查询物流进度。如已签收请在有效期内先实名激活。客服热线4008155555【中国电信】",
|
130 |
+
"app_name": "com.tencent.mm",
|
131 |
+
"message_id": 330225703,
|
132 |
+
"user_id": 74797059,
|
133 |
+
"date": "2025-03-30T22:57:51",
|
134 |
+
"classification": "TruePositive"
|
135 |
+
},
|
136 |
+
{
|
137 |
+
"sender": "106875230196298038",
|
138 |
+
"content": "【京东快递】取件码C8042,您的快件尾号8980已送达成都东软学院配送营业点,地址:四川成都市都江堰市青城山镇东软大道1号东软学院电信营业厅旁 ,联系电话13032837084",
|
139 |
+
"app_name": "SMS",
|
140 |
+
"message_id": 328233028,
|
141 |
+
"user_id": 33642157,
|
142 |
+
"date": "2025-03-28T23:30:29",
|
143 |
+
"classification": "TruePositive"
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"sender": "1068837016151",
|
147 |
+
"content": "【中通快递】73547529665397放在丰巢的包裹请及时取件,如有问题致电15900077340、020-22504077。",
|
148 |
+
"app_name": "SMS",
|
149 |
+
"message_id": 322223721,
|
150 |
+
"user_id": 67619984,
|
151 |
+
"date": "2025-03-22T22:37:50",
|
152 |
+
"classification": "TruePositive"
|
153 |
+
},
|
154 |
+
{
|
155 |
+
"sender": "ASAP Azure ToDoAgent",
|
156 |
+
"content": "陈格(alisagege.chen): WANG Siyu邀请你加入飞书视频会议\n会议主题:FilterLLM ToDoGenLLM PR Merge讨论\n会议时间:3月31日 (今天) 12:00 - 12:30 (GMT+8)\n会议 ID:656 445 907\n会议链接:https://vc.feishu.cn/j/656445907\n\n手机拨号一键入会\n+862122504720,,656445907(中国大陆)\n4008208888,,656445907(中国大陆)\n\n根据所在地拨打号...",
|
157 |
+
"app_name": "com.ss.android.lark",
|
158 |
+
"message_id": 331150106,
|
159 |
+
"user_id": 33642157,
|
160 |
+
"date": "2025-03-31T15:01:36",
|
161 |
+
"classification": "TruePositive"
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"sender": "106917190196",
|
165 |
+
"content": "【广州自来水】(水费通知)尊敬的0001914043用户(大沙头三马路9号大院8号604)水量6(202503行度83),应缴水费11.88元,污水费5.7元。了解更多用水资讯可关注“广州自来水96968”微信公众号。",
|
166 |
+
"app_name": "SMS",
|
167 |
+
"message_id": 319155403,
|
168 |
+
"user_id": 67619984,
|
169 |
+
"date": "2025-03-19T15:54:16",
|
170 |
+
"classification": "TruePositive"
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"sender": "ASAP Sample",
|
174 |
+
"content": "AlisaGG陈G老师1010: 【招商银行】您本月办理分期还款可享限时2.3折优惠!打开掌上生活APP,搜“分期还款”立享折扣优惠!资格实时测评,拒收请回复R",
|
175 |
+
"app_name": "com.tencent.mm",
|
176 |
+
"message_id": 330231000,
|
177 |
+
"user_id": 67619984,
|
178 |
+
"date": "2025-03-30T23:10:59",
|
179 |
+
"classification": "TrueNegative"
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"sender": "ASAP Sample",
|
183 |
+
"content": "[4条]AlisaGG: 【招商银行】您本月办理分期还款可享限时2.3折优惠!打开掌上生活APP,搜“分期还款”立享折扣优惠!资格实时测评,拒收请回复R",
|
184 |
+
"app_name": "com.tencent.mm",
|
185 |
+
"message_id": 330231005,
|
186 |
+
"user_id": 74797059,
|
187 |
+
"date": "2025-03-30T23:11:00",
|
188 |
+
"classification": "TrueNegative"
|
189 |
+
}
|
190 |
+
]
|
LLM/Database/Messages.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
LLM/Database/README.md
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Messages.json
|
2 |
+
原始数据
|
3 |
+
## Manual_filtering_samples.json
|
4 |
+
手工筛的
|
5 |
+
## negative_samples.json
|
6 |
+
ai辅助筛的负样本
|
7 |
+
## positive_samples.json
|
8 |
+
ai辅助筛的正样本
|
LLM/Database/TrueNegative_samples.json
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"sender": "ASAP Sample",
|
4 |
+
"content": "AlisaGG陈G老师1010: 【招商银行】您本月办理分期还款可享限时2.3折优惠!打开掌上生活APP,搜“分期还款”立享折扣优惠!资格实时测评,拒收请回复R",
|
5 |
+
"app_name": "com.tencent.mm",
|
6 |
+
"message_id": 330231000,
|
7 |
+
"user_id": 67619984,
|
8 |
+
"date": "2025-03-30T23:10:59",
|
9 |
+
"classification": "TrueNegative"
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"sender": "ASAP Sample",
|
13 |
+
"content": "[4条]AlisaGG: 【招商银行】您本月办理分期还款可享限时2.3折优惠!打开掌上生活APP,搜“分期还款”立享折扣优惠!资格实时测评,拒收请回复R",
|
14 |
+
"app_name": "com.tencent.mm",
|
15 |
+
"message_id": 330231005,
|
16 |
+
"user_id": 74797059,
|
17 |
+
"date": "2025-03-30T23:11:00",
|
18 |
+
"classification": "TrueNegative"
|
19 |
+
},
|
20 |
+
{
|
21 |
+
"sender": "ASAP Azure ToDoAgent",
|
22 |
+
"content": "WANG Siyu: [图片]",
|
23 |
+
"app_name": "com.ss.android.lark",
|
24 |
+
"message_id": 401101707,
|
25 |
+
"user_id": 67619984,
|
26 |
+
"date": "2025-04-01T10:17:31"
|
27 |
+
},
|
28 |
+
{
|
29 |
+
"sender": "ASAP Azure ToDoAgent",
|
30 |
+
"content": "WANG Siyu: 我真的尝试给我们做一个公共邮箱 outlook🤮",
|
31 |
+
"app_name": "com.ss.android.lark",
|
32 |
+
"message_id": 401101708,
|
33 |
+
"user_id": 67619984,
|
34 |
+
"date": "2025-04-01T10:17:48"
|
35 |
+
},
|
36 |
+
{
|
37 |
+
"sender": "1065812627",
|
38 |
+
"content": "【流量领取提醒】尊敬的用户,您已获得10GB免费通用流量,将于2025年4月30日到期,点击:https://c.139.com/m/a/s?p=GDCM 或到“移动云盘App-我的-领奖专区-优惠卡券”领取,数量有限,领完即止。拒收请回复R【中国移动云盘】",
|
39 |
+
"app_name": "SMS",
|
40 |
+
"message_id": 401101418,
|
41 |
+
"user_id": 74797059,
|
42 |
+
"date": "2025-04-01T10:14:54"
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"sender": "106575329994",
|
46 |
+
"content": "【霸王茶姬】好茶一口鲜,唤醒好状态,送你「整单8折+双杯78券」3ct.cc/e1mwQaSz 仅限广东区域部分门店核销 拒收请回复R",
|
47 |
+
"app_name": "SMS",
|
48 |
+
"message_id": 401100317,
|
49 |
+
"user_id": 74797059,
|
50 |
+
"date": "2025-04-01T10:03:40"
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"sender": "10086",
|
54 |
+
"content": "尊敬的客户,根据您的充值历史记录赠送6.00元,您总共还有待赠送金额60.00元,其中下月将到帐6.00元,当前可用余额73.51元,感谢您对我公司的支持。登录“中国移动”APP足不出户查办业务,尊享充值优惠,立即前往: https://dx.10086.cn/A/kHXkEQ 。【中国移动】",
|
55 |
+
"app_name": "SMS",
|
56 |
+
"message_id": 401090015,
|
57 |
+
"user_id": 74797059,
|
58 |
+
"date": "2025-04-01T09:00:45"
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"sender": "106980095533",
|
62 |
+
"content": "【建设银行】建行送福利啦!支付1分钱,有机会抽取2-188元微信立减金,100%中奖!手机银行APP首页搜索进入“建粤有礼”,点击“月月有礼”即可参与活动。如已参与,请忽略本条短信,详询广州建行各网点大堂经理。拒收请回复R",
|
63 |
+
"app_name": "SMS",
|
64 |
+
"message_id": 329111118,
|
65 |
+
"user_id": 67619984,
|
66 |
+
"date": "2025-03-29T11:11:51"
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"sender": "1068518895595",
|
70 |
+
"content": "【光大银行】尊敬的光大信用卡受邀客户!欢迎参与4月积分福利大放送活动。2025年4月1日至30日累计消费满200元额外获赠1千积分,累计消费满800元再赠4千积分,累计消费满3千元叠加获赠1万积分!最高奖励1.5万积分,有效期1年。详情可咨询95595 。*拒收请回复R",
|
71 |
+
"app_name": "SMS",
|
72 |
+
"message_id": 328124936,
|
73 |
+
"user_id": 67619984,
|
74 |
+
"date": "2025-03-28T12:49:56"
|
75 |
+
},
|
76 |
+
{
|
77 |
+
"sender": "备忘录",
|
78 |
+
"content": "[4条]M丶D: 【开心麻花】开心麻花推出“降温补贴票”!《贼想得到你》新开场次买一送一,优惠限时到气温恢复15°C!抢票戳:kxmh.net/3Eo9。拒收请回复R",
|
79 |
+
"app_name": "com.tencent.mm",
|
80 |
+
"message_id": 328111725,
|
81 |
+
"user_id": 67619984,
|
82 |
+
"date": "2025-03-28T11:17:56"
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"sender": "10086109",
|
86 |
+
"content": "春风送暖绿意浓,森林防火莫放松!清明祭祀倡新风,鲜花追思情更重;踏青旅游守规章,莫留火种在山中。严防森林火灾,永葆绿色家园。(市规划资源局)",
|
87 |
+
"app_name": "SMS",
|
88 |
+
"message_id": 328105821,
|
89 |
+
"user_id": 67619984,
|
90 |
+
"date": "2025-03-28T10:58:47"
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"sender": "10680175780000628",
|
94 |
+
"content": "【养生课】派送中:通知您数次了,您的免费中医养生课程已经送到,请及时通过 b.i7o.cn/0B7YCY1 获取,拒收请回复R",
|
95 |
+
"app_name": "SMS",
|
96 |
+
"message_id": 328100301,
|
97 |
+
"user_id": 67619984,
|
98 |
+
"date": "2025-03-28T10:03:20"
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"sender": "ASAP Azure ToDoAgent",
|
102 |
+
"content": "WANG Siyu: Message_id: 1, content:“ 【中国联通】出境漫游低至19.5元/天,订购天数越多越优惠,覆盖全球106个国家和地区,一次订购畅游全球!即日起至12月31日,订购国漫畅游流量3天流量包及以上天数产品,即可获赠权益六选一通兑券一张,打车、外卖、美食代金券等你来拿。订购方式点击https://u.10010.cn/uAbdw 或发送KTGM10至10010,根据回复提示进行订购,更多出境漫游流量产品请点击https://u.10010.cn...",
|
103 |
+
"app_name": "com.ss.android.lark",
|
104 |
+
"message_id": 327121407,
|
105 |
+
"user_id": 33642157,
|
106 |
+
"date": "2025-03-27T12:14:17"
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"sender": "10086",
|
110 |
+
"content": "【防诈温馨提醒】尊敬的客户,惠州移动提醒您:“不明来电及时挂”、“可疑链接不点击”,请您对各类调研评分、评价有奖等陌生来电谨慎接听,谨防电信网络诈骗。您可回复KTFSR办理高频骚扰电话拦截服务,该服务无需收费,办理后如需取消请发送QXFSR到10086。点击 https://dx.10086.cn/A/tx04HA 进入“中国移动”APP服务大厅,可清晰查阅已办理业务,诚邀体验。心级服务、让爱连接,我们期待您的“10分”满意。【中国移动】",
|
111 |
+
"app_name": "SMS",
|
112 |
+
"message_id": 327111797,
|
113 |
+
"user_id": 74797059,
|
114 |
+
"date": "2025-03-27T11:17:29"
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"sender": "🌳广州新地标!云山天地",
|
118 |
+
"content": "白云山南门旁,美食购物一站式,负一层还有儿童乐园,逛完白云山来歇歇脚吧!",
|
119 |
+
"app_name": "com.dianping.v1",
|
120 |
+
"message_id": 327085555,
|
121 |
+
"user_id": 67619984,
|
122 |
+
"date": "2025-03-27T08:55:56"
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"sender": "四川一小区疑高空抛物砸死快递员",
|
126 |
+
"content": "点击查看详情>>>",
|
127 |
+
"app_name": "com.sina.weibo",
|
128 |
+
"message_id": 327083854,
|
129 |
+
"user_id": 67619984,
|
130 |
+
"date": "2025-03-27T08:38:49"
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"sender": "男子为发明永动机出租屋连杀5人",
|
134 |
+
"content": "男子为发明永动机,想通过杀人锻炼胆量,在出租屋内连杀5人,被判处死刑,目前案件正在二审审理期间。",
|
135 |
+
"app_name": "com.ss.android.ugc.aweme",
|
136 |
+
"message_id": 327083851,
|
137 |
+
"user_id": 67619984,
|
138 |
+
"date": "2025-03-27T08:38:49"
|
139 |
+
},
|
140 |
+
{
|
141 |
+
"sender": "福利红包天天抢,最高666元",
|
142 |
+
"content": "机会难得,不要错过>>",
|
143 |
+
"app_name": "com.taobao.taobao",
|
144 |
+
"message_id": 327083853,
|
145 |
+
"user_id": 67619984,
|
146 |
+
"date": "2025-03-27T08:38:49"
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"sender": "106980095533",
|
150 |
+
"content": "【建设银行】您账户8699于3月27日8时3分向微信支付-羊城通缴费支出人民币3.5元,可用余额5240.93元。",
|
151 |
+
"app_name": "SMS",
|
152 |
+
"message_id": 327080344,
|
153 |
+
"user_id": 67619984,
|
154 |
+
"date": "2025-03-27T08:03:48"
|
155 |
+
},
|
156 |
+
{
|
157 |
+
"sender": "恭喜,你被喜马选中做有声书副业!",
|
158 |
+
"content": "根据收听记录,你被认定适合喜马的声音副业,下班后用1-2个小时录制时薪书、分成书上传就可能获得收获!",
|
159 |
+
"app_name": "com.ximalaya.ting.android",
|
160 |
+
"message_id": 326202922,
|
161 |
+
"user_id": 67619984,
|
162 |
+
"date": "2025-03-26T20:29:01"
|
163 |
+
},
|
164 |
+
{
|
165 |
+
"sender": "备忘录",
|
166 |
+
"content": "[14条]M丶D: [文件] WEMEC METABOTS R60(1).pdf",
|
167 |
+
"app_name": "com.tencent.mm",
|
168 |
+
"message_id": 326183714,
|
169 |
+
"user_id": 67619984,
|
170 |
+
"date": "2025-03-26T18:37:48"
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"sender": "你的兴趣商品",
|
174 |
+
"content": "JETSON ORIN NX Super 开发套件 AI 智能模组orin nx 主板限时优惠发放中!",
|
175 |
+
"app_name": "com.taobao.taobao",
|
176 |
+
"message_id": 326190015,
|
177 |
+
"user_id": 67619984,
|
178 |
+
"date": "2025-03-26T19:00:09"
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"sender": "恭喜您被666元红包选中啦!",
|
182 |
+
"content": "好友***已成功领取,抓紧参与吧!",
|
183 |
+
"app_name": "com.taobao.taobao",
|
184 |
+
"message_id": 326190116,
|
185 |
+
"user_id": 67619984,
|
186 |
+
"date": "2025-03-26T19:01:14"
|
187 |
+
},
|
188 |
+
{
|
189 |
+
"sender": "106980095533",
|
190 |
+
"content": "【建设银行】您账户8699于3月26日19时8分向微信支付-羊城通缴费支出人民币1元,可用余额5263.83元。",
|
191 |
+
"app_name": "SMS",
|
192 |
+
"message_id": 326190917,
|
193 |
+
"user_id": 67619984,
|
194 |
+
"date": "2025-03-26T19:09:07"
|
195 |
+
}
|
196 |
+
|
197 |
+
]
|
LLM/Database/TruePositive_samples.json
ADDED
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"sender": "ASAP Sample",
|
4 |
+
"content": "王斯煜[表情]Vince 黑客松nv: @AlisaGG @LJK86 明天中午再对 WANG Siyu邀请你加入飞书视频会议\n会议主题:FilterLLM ToDoGenLLM PR Merge讨论\n会议时间:3月31日 (今天) 12:00 - 12:30 (GMT+8)\n会议 ID:656 445 907\n会议链接:https://...",
|
5 |
+
"app_name": "com.tencent.mm",
|
6 |
+
"message_id": 331001504,
|
7 |
+
"user_id": 67619984,
|
8 |
+
"date": "2025-03-31T00:15:21",
|
9 |
+
"classification": "TruePositive"
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"sender": "ASAP Sample",
|
13 |
+
"content": "[6条]王斯煜[表情]Vince 黑客松nv: Your Mint 3-Month, Unlimited plan expires in 2 days. Log into your account at my.mintmobile.com or via our app to make a payment and keep your Mint se...",
|
14 |
+
"app_name": "com.tencent.mm",
|
15 |
+
"message_id": 331002905,
|
16 |
+
"user_id": 67619984,
|
17 |
+
"date": "2025-03-31T00:29:45",
|
18 |
+
"classification": "TruePositive"
|
19 |
+
},
|
20 |
+
{
|
21 |
+
"sender": "ASAP Sample",
|
22 |
+
"content": "[4条]斯煜[表情]Vince: 【美团月付】您2月账单805.57元待还,最后还款日为本月22号,查账或立即还款点 dpurl.cn/HXFUxEba",
|
23 |
+
"app_name": "com.tencent.mm",
|
24 |
+
"message_id": 331002907,
|
25 |
+
"user_id": 74797059,
|
26 |
+
"date": "2025-03-31T00:29:45",
|
27 |
+
"classification": "TruePositive"
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"sender": "ASAP Sample",
|
31 |
+
"content": "[4条]斯煜[表情]Vince: 【中通快递】73547529665397放在丰巢的包裹请及时取件,如有问题致电15900077340、020-22504077。",
|
32 |
+
"app_name": "com.tencent.mm",
|
33 |
+
"message_id": 331002908,
|
34 |
+
"user_id": 74797059,
|
35 |
+
"date": "2025-03-31T00:29:45",
|
36 |
+
"classification": "TruePositive"
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"sender": "ASAP Sample",
|
40 |
+
"content": "[6条]斯煜[表情]Vince: 尊敬的*斯煜,您在我行办理的1笔个人贷款需于2024年12月08日17:00前还款,当期还款金额本息合计999999.84元,请您留意尾号0455的账户可用余额是否充足,避免因贷款逾期影响个人征信。具体贷款信息可通过工行网上银行、手机银行或致电贷款经办行查询。【工商银行】",
|
41 |
+
"app_name": "com.tencent.mm",
|
42 |
+
"message_id": 331002909,
|
43 |
+
"user_id": 74797059,
|
44 |
+
"date": "2025-03-31T00:29:45",
|
45 |
+
"classification": "TruePositive"
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"sender": "ASAP Sample",
|
49 |
+
"content": "[6条]斯煜[表情]Vince: Your Mint 3-Month, Unlimited plan expires in 2 days. Log into your account at my.mintmobile.com or via our app to make a payment and keep your Mint se...",
|
50 |
+
"app_name": "com.tencent.mm",
|
51 |
+
"message_id": 331002910,
|
52 |
+
"user_id": 74797059,
|
53 |
+
"date": "2025-03-31T00:29:45",
|
54 |
+
"classification": "TruePositive"
|
55 |
+
},
|
56 |
+
{
|
57 |
+
"sender": "ASAP Sample",
|
58 |
+
"content": "王斯煜[表情]Vince 黑客松nv: WANG Siyu邀请你加入飞书视频会议\n会议主题:FilterLLM ToDoGenLLM PR Merge讨论\n会议时间:3月31日 (今天) 12:00 - 12:30 (GMT+8)\n会议 ID:656 445 907\n会议链接:https://vc.feishu.cn/j/65644590...",
|
59 |
+
"app_name": "com.tencent.mm",
|
60 |
+
"message_id": 331115637,
|
61 |
+
"user_id": 67619984,
|
62 |
+
"date": "2025-03-31T11:56:36",
|
63 |
+
"classification": "TruePositive"
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"sender": "ASAP Sample",
|
67 |
+
"content": "AlisaGG陈G老师1010: 【驿收发】您的邮政包裹已到凯丰花园2栋驿站,请23:00前凭5-5-6530来取,详询18320926368",
|
68 |
+
"app_name": "com.tencent.mm",
|
69 |
+
"message_id": 331133944,
|
70 |
+
"user_id": 67619984,
|
71 |
+
"date": "2025-03-31T13:39:49",
|
72 |
+
"classification": "TruePositive"
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"sender": "ASAP Sample",
|
76 |
+
"content": "[2条]AlisaGG: 【小象超市】您好,我是小象超市骑手,【美团智能外卖柜】您的外卖已送至新一代产业园2栋4号柜(面向马路侧),格口号:27,取件码:9310,存柜超过12小时将被清理,请及时取件。",
|
77 |
+
"app_name": "com.tencent.mm",
|
78 |
+
"message_id": 331134011,
|
79 |
+
"user_id": 74797059,
|
80 |
+
"date": "2025-03-31T13:40:54"
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"sender": "ASAP Sample",
|
84 |
+
"content": "[2条]AlisaGG陈G老师1010: 【小象超市】您好,我是小象超市骑手,【美团智能外卖柜】您的外卖已送至新一代产业园2栋4号柜(面向马路侧),格口号:27,取件码:9310,存柜超过12小时将被清理,请及时取件。",
|
85 |
+
"app_name": "com.tencent.mm",
|
86 |
+
"message_id": 331134045,
|
87 |
+
"user_id": 67619984,
|
88 |
+
"date": "2025-03-31T13:40:54",
|
89 |
+
"classification": "TruePositive"
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"sender": "ASAP Sample",
|
93 |
+
"content": "[3条]AlisaGG陈G老师1010: 取件通知\n取件码: 11724837\n运单号: 464285154986072\n取件地址: 深圳新一代产业园P2出入口内侧4号丰巢柜\n配送人员: 18124519013\n计费规则: 查看详情",
|
94 |
+
"app_name": "com.tencent.mm",
|
95 |
+
"message_id": 331134346,
|
96 |
+
"user_id": 67619984,
|
97 |
+
"date": "2025-03-31T13:43:17",
|
98 |
+
"classification": "TruePositive"
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"sender": "ASAP Sample",
|
102 |
+
"content": "[4条]AlisaGG陈G老师1010: 取件再次提醒\n取件码:: 27696201\n配送公司:: 申通快递\n运单号:: 777293635831671\n配送员手机:: 13392809673\n取件地址:: 深圳新一代产业园P2出入口内侧4号丰巢柜",
|
103 |
+
"app_name": "com.tencent.mm",
|
104 |
+
"message_id": 331134447,
|
105 |
+
"user_id": 67619984,
|
106 |
+
"date": "2025-03-31T13:44:02",
|
107 |
+
"classification": "TruePositive"
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"sender": "ASAP Sample",
|
111 |
+
"content": "[5条]李JK老师-1228: 【菜鸟驿站】请凭140-3-1019到菜鸟驿站取件,查询详情u.cainiao.com/53h4bSr7zrh",
|
112 |
+
"app_name": "com.tencent.mm",
|
113 |
+
"message_id": 331135751,
|
114 |
+
"user_id": 67619984,
|
115 |
+
"date": "2025-03-31T13:57:28",
|
116 |
+
"classification": "TruePositive"
|
117 |
+
},
|
118 |
+
{
|
119 |
+
"sender": "ASAP Sample",
|
120 |
+
"content": "刻刻: 【停机前提醒】尊敬的移动客户,您好!您的账户余额不足被限制使用。现提醒您需充值缴费至少32.49元,以确保您继续享受畅通的通信服务。诚邀您一键办理自动充服务,自动充值缴费更轻松:https://dx.10086.cn/7WyRLA。 心级服务、让爱连接【中国移动】",
|
121 |
+
"app_name": "com.tencent.mm",
|
122 |
+
"message_id": 331162269,
|
123 |
+
"user_id": 67619984,
|
124 |
+
"date": "2025-03-31T16:22:21",
|
125 |
+
"classification": "TruePositive"
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"sender": "ASAP Sample",
|
129 |
+
"content": "[3条]LJK86: 您的号卡已在配送途中,物流单号SF3147624215612,点击:http://t.hn.189.cn/EZv6bazD,查询物流进度。如已签收请在有效期内先实名激活。客服热线4008155555【中国电信】",
|
130 |
+
"app_name": "com.tencent.mm",
|
131 |
+
"message_id": 330225703,
|
132 |
+
"user_id": 74797059,
|
133 |
+
"date": "2025-03-30T22:57:51",
|
134 |
+
"classification": "TruePositive"
|
135 |
+
},
|
136 |
+
{
|
137 |
+
"sender": "106875230196298038",
|
138 |
+
"content": "【京东快递】取件码C8042,您的快件尾号8980已送达成都东软学院配送营业点,地址:四川成都市都江堰市青城山镇东软大道1号东软学院电信营业厅旁 ,联系电话13032837084",
|
139 |
+
"app_name": "SMS",
|
140 |
+
"message_id": 328233028,
|
141 |
+
"user_id": 33642157,
|
142 |
+
"date": "2025-03-28T23:30:29",
|
143 |
+
"classification": "TruePositive"
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"sender": "1068837016151",
|
147 |
+
"content": "【中通快递】73547529665397放在丰巢的包裹请及时取件,如有问题致电15900077340、020-22504077。",
|
148 |
+
"app_name": "SMS",
|
149 |
+
"message_id": 322223721,
|
150 |
+
"user_id": 67619984,
|
151 |
+
"date": "2025-03-22T22:37:50",
|
152 |
+
"classification": "TruePositive"
|
153 |
+
},
|
154 |
+
{
|
155 |
+
"sender": "ASAP Azure ToDoAgent",
|
156 |
+
"content": "陈格(alisagege.chen): WANG Siyu邀请你加入飞书视频会议\n会议主题:FilterLLM ToDoGenLLM PR Merge讨论\n会议时间:3月31日 (今天) 12:00 - 12:30 (GMT+8)\n会议 ID:656 445 907\n会议链接:https://vc.feishu.cn/j/656445907\n\n手机拨号一键入会\n+862122504720,,656445907(中国大陆)\n4008208888,,656445907(中国大陆)\n\n根据所在地拨打号...",
|
157 |
+
"app_name": "com.ss.android.lark",
|
158 |
+
"message_id": 331150106,
|
159 |
+
"user_id": 33642157,
|
160 |
+
"date": "2025-03-31T15:01:36",
|
161 |
+
"classification": "TruePositive"
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"sender": "106917190196",
|
165 |
+
"content": "【广州自来水】(水费通知)尊敬的0001914043用户(大沙头三马路9号大院8号604)水量6(202503行度83),应缴水费11.88元,污水费5.7元。了解更多用水资讯可关注“广州自来水96968”微信公众号。",
|
166 |
+
"app_name": "SMS",
|
167 |
+
"message_id": 319155403,
|
168 |
+
"user_id": 67619984,
|
169 |
+
"date": "2025-03-19T15:54:16",
|
170 |
+
"classification": "TruePositive"
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"sender": "ASAP Sample",
|
174 |
+
"content": "AlisaGG: 【小象超市】您的商品已放置在门口,因有易碎等商品请尽快取回,如有疑问请联系15794935204 。祝您生活愉快!",
|
175 |
+
"app_name": "com.tencent.mm",
|
176 |
+
"message_id": 331232914,
|
177 |
+
"user_id": 74797059,
|
178 |
+
"date": "2025-03-31T23:29:47"
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"sender": "ASAP Sample",
|
182 |
+
"content": "AlisaGG陈G老师1010: 【小象超市】您的商品已放置在门口,因有易碎等商品请尽快取回,如有疑问请联系15794935204 。祝您生活愉快!",
|
183 |
+
"app_name": "com.tencent.mm",
|
184 |
+
"message_id": 331232986,
|
185 |
+
"user_id": 67619984,
|
186 |
+
"date": "2025-03-31T23:29:46"
|
187 |
+
},
|
188 |
+
{
|
189 |
+
"sender": "ASAP Sample",
|
190 |
+
"content": "[3条]AlisaGG: 取件通知\n取件码: 11724837\n运单号: 464285154986072\n取件地址: 深圳新一代产业园P2出入口内侧4号丰巢柜\n配送人员: 18124519013\n计费规则: 查看详情",
|
191 |
+
"app_name": "com.tencent.mm",
|
192 |
+
"message_id": 331150113,
|
193 |
+
"user_id": 33642157,
|
194 |
+
"date": "2025-03-31T15:01:37"
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"sender": "ASAP Sample",
|
198 |
+
"content": "[4条]AlisaGG: 取件再次提醒\n取件码:: 27696201\n配送公司:: 申通快递\n运单号:: 777293635831671\n配送员手机:: 13392809673\n取件地址:: 深圳新一代产业园P2出入口内侧4号丰巢柜",
|
199 |
+
"app_name": "com.tencent.mm",
|
200 |
+
"message_id": 331150114,
|
201 |
+
"user_id": 33642157,
|
202 |
+
"date": "2025-03-31T15:01:37"
|
203 |
+
},
|
204 |
+
{
|
205 |
+
"sender": "ASAP Sample",
|
206 |
+
"content": "AlisaGG: 【驿收发】您的邮政包裹已到凯丰花园2栋驿站,请23:00前凭5-5-6530来取,详询18320926368",
|
207 |
+
"app_name": "com.tencent.mm",
|
208 |
+
"message_id": 331150111,
|
209 |
+
"user_id": 33642157,
|
210 |
+
"date": "2025-03-31T15:01:36"
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"sender": "ASAP Sample",
|
214 |
+
"content": "[2条]AlisaGG: 【小象超市】您好,我是小象超市骑手,【美团智能外卖柜】您的外卖已送至新一代产业园2栋4号柜(面向马路侧),格口号:27,取件码:9310,存柜超过12小时将被清理,请及时取件。",
|
215 |
+
"app_name": "com.tencent.mm",
|
216 |
+
"message_id": 331150112,
|
217 |
+
"user_id": 33642157,
|
218 |
+
"date": "2025-03-31T15:01:37"
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"sender": "ASAP Sample",
|
222 |
+
"content": "斯煜[表情]Vince: WANG Siyu邀请你加入飞书视频会议\n会议主题:FilterLLM ToDoGenLLM PR Merge讨论\n会议时间:3月31日 (今天) 12:00 - 12:30 (GMT+8)\n会议 ID:656 445 907\n会议链接:https://vc.feishu.cn/j/65644590...",
|
223 |
+
"app_name": "com.tencent.mm",
|
224 |
+
"message_id": 331150102,
|
225 |
+
"user_id": 33642157,
|
226 |
+
"date": "2025-03-31T15:01:37"
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"sender": "#SoSIM",
|
230 |
+
"content": "[SoSIM提示] 未來30日自動續購服務收費 (04/2025)\r\n\r\n為確保你現有的服務不會中斷,根據系統紀錄,以下是你未來30日已開啟自動續購的服務收費總額,只供參考。如果此額多於「現有儲值額」,建議立即增值。\r\n\r\n\r\n儲值卡號碼:96334767\r\n\r\n未來30日續購服務總額 : $55.0\r\n\r\n現有儲值額 : $26.65\r\n\r\n注意:個別服務 (包括內地副號及附屬卡) 一旦到期而未有足夠餘額自動續購,有關號碼將即時失效而不可復原。\r\n\r\n請於賬戶主頁 > 設定 內瀏覽有關服務組合有效期及詳情。\r\n\r\n\r\n***如果你的「自動增值」金額不少於所需服務收費,請忽略此訊息***",
|
231 |
+
"app_name": "SMS",
|
232 |
+
"message_id": 401100763,
|
233 |
+
"user_id": 96989258,
|
234 |
+
"date": "2025-04-01T10:07:29"
|
235 |
+
},
|
236 |
+
{
|
237 |
+
"sender": "106875230196298038",
|
238 |
+
"content": "【京东快递】取件码C8042,您的快件尾号8980已送达成都东软学院配送营业点,地址:四川成都市都江堰市青城山镇东软大道1号东软学院电信营业厅旁 ,联系电话13032837084",
|
239 |
+
"app_name": "SMS",
|
240 |
+
"message_id": 328233028,
|
241 |
+
"user_id": 33642157,
|
242 |
+
"date": "2025-03-28T23:30:29"
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"sender": "您购买的宝贝已送达自提柜",
|
246 |
+
"content": "包裹待签收,点击查看详情>>",
|
247 |
+
"app_name": "com.taobao.taobao",
|
248 |
+
"message_id": 327135963,
|
249 |
+
"user_id": 67619984,
|
250 |
+
"date": "2025-03-27T13:59:30"
|
251 |
+
},
|
252 |
+
{
|
253 |
+
"sender": "您购买的宝贝正在派送中",
|
254 |
+
"content": "【搓澡巾洗澡神器男女款】预计送往【智能柜】,手机号已加密,享号码保护服务,点击查看预计送达时间>>",
|
255 |
+
"app_name": "com.taobao.taobao",
|
256 |
+
"message_id": 327091156,
|
257 |
+
"user_id": 67619984,
|
258 |
+
"date": "2025-03-27T09:11:37"
|
259 |
+
},
|
260 |
+
{
|
261 |
+
"sender": "#SoSIM",
|
262 |
+
"content": "[SoSIM提示] 30日/無限影視數據組合 (由2025年1月7日起續購收費$55/30日) 用量提示:\r\n儲值卡號碼:96334767\r\n累積用量:30 GB \r\n公平使用數據用量上限:已用50% (截至:27/03/2025 06:53:27)\r\n如已達到 / 超過公平使用數據用量100%,數據傳輸速度將會即時被限制至高達128kbps,直至 25/04/2025 19:42:52 (香港時間)。\r\n立即於賬戶主頁 > 本地服務 https://sosimhk.com/SoSim/main/tc/topup 購買追加數據以保持流暢的上網速度。\r\n",
|
263 |
+
"app_name": "SMS",
|
264 |
+
"message_id": 327065312,
|
265 |
+
"user_id": 96989258,
|
266 |
+
"date": "2025-03-27T06:53:42"
|
267 |
+
}
|
268 |
+
|
269 |
+
|
270 |
+
]
|
LLM/Database/classify_samples.py
ADDED
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import csv
|
3 |
+
import os
|
4 |
+
import pandas as pd
|
5 |
+
import openai
|
6 |
+
import time
|
7 |
+
import requests
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
from tqdm import tqdm
|
10 |
+
|
11 |
+
# Load environment variables (from a .env file when one exists).
load_dotenv()

# SiliconFlow API settings.
# The key is read from the environment only: a real key must never be committed
# as a fallback default. When the variable is unset the key is "", which main()
# detects and reports before doing any work.
# NOTE: the key that used to be hard-coded here is public in the repository
# history and should be revoked.
SILICONFLOW_API_KEY = os.getenv("SILICONFLOW_API_KEY", "")
SILICONFLOW_API_BASE = os.getenv("SILICONFLOW_API_BASE", "https://api.siliconflow.cn/v1")

# OpenAI API settings, kept as a fallback provider.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "your_openai_api_key_here")
openai.api_key = OPENAI_API_KEY

# Optional Azure OpenAI settings; all three must be non-blank to take effect.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "")
AZURE_DEPLOYMENT_NAME = os.getenv("AZURE_DEPLOYMENT_NAME", "")

# When a complete Azure configuration is present, point the openai client at it.
if AZURE_OPENAI_ENDPOINT.strip() and AZURE_OPENAI_API_KEY.strip() and AZURE_DEPLOYMENT_NAME.strip():
    openai.api_type = "azure"
    openai.api_base = AZURE_OPENAI_ENDPOINT
    openai.api_key = AZURE_OPENAI_API_KEY
    openai.api_version = "2023-05-15"  # may need adjusting for the actual deployment
|
38 |
+
|
39 |
+
# 定义TruePositive的标准(根据mvp三类案例)
|
40 |
+
def define_positive_sample_criteria():
    """Return the prompt text that tells the model how to label a message.

    The prompt is written in Chinese. It lists three criteria (explicit tasks
    or to-dos, schedules or deadlines, progress or status updates) and asks
    the model to answer with only "TruePositive" or "TrueNegative".

    Returns:
        str: The classification instructions to prepend to each message.
    """
    criteria_prompt = """
请判断以下消息是否属于TruePositive。TruePositive定义为与任务管理、待办事项、提醒、通知筛选相关的有用信息,具体包括:
1. 包含明确的任务、待办事项或需要完成的工作
2. 包含时间安排、截止日期或日程提醒
3. 包含项目进展、状态更新或工作报告

如果消息符合以上任一条件,则为TruePositive;否则为TrueNegative。
请只回答"TruePositive"或"TrueNegative"。
"""
    return criteria_prompt
|
55 |
+
|
56 |
+
# 使用大模型API进行分类
|
57 |
+
def classify_with_llm(message, criteria, max_retries=3, retry_delay=2, timeout=30):
    """Classify one message through the SiliconFlow chat-completions API.

    Args:
        message: Text of the message to classify.
        criteria: Prompt text describing the classification standard.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to sleep between failed attempts.
        timeout: Seconds to wait for each HTTP request; without a timeout a
            stalled connection would block the whole run indefinitely.

    Returns:
        str: "TruePositive" when the model's reply contains that label,
        "TrueNegative" for any other reply, or "分类失败" when no attempt
        succeeded (including ``max_retries <= 0``).
    """
    prompt = f"{criteria}\n\n消息内容: {message}"
    system_message = "你是一个专业的数据分类助手,根据给定标准判断消息是TruePositive还是TrueNegative。"

    # The request is identical on every attempt, so build it once.
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {SILICONFLOW_API_KEY}"
    }
    payload = {
        "model": "deepseek-ai/DeepSeek-V3",
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt}
        ],
        "stream": False,
        "max_tokens": 512,
        "temperature": 0.1,
        "top_p": 0.7,
        "top_k": 50,
        "frequency_penalty": 0.5,
        "n": 1
    }

    for attempt in range(max_retries):
        # Reset per attempt so the error report never shows the response of an
        # earlier attempt when this attempt failed before receiving one.
        response = None
        try:
            response = requests.post(
                f"{SILICONFLOW_API_BASE}/chat/completions",
                headers=headers,
                json=payload,
                timeout=timeout
            )
            response.raise_for_status()
            result = response.json()["choices"][0]["message"]["content"].strip()
        except (requests.RequestException, KeyError, IndexError,
                TypeError, ValueError, AttributeError) as e:
            # `is not None` on purpose: a Response object is falsy for 4xx/5xx.
            status = response.status_code if response is not None else 'N/A'
            body = response.text if response is not None else 'N/A'
            if attempt < max_retries - 1:
                print(f"API调用失败,{retry_delay}秒后重试: {e}\n响应状态码: {status}\n响应内容: {body}")
                time.sleep(retry_delay)
            else:
                print(f"API调用失败,达到最大重试次数: {e}\n最后响应状态码: {status}\n最后响应内容: {body}")
            continue

        # Normalise the reply: anything that is not explicitly positive is negative.
        return "TruePositive" if "TruePositive" in result else "TrueNegative"

    return "分类失败"  # sentinel: every attempt failed
|
122 |
+
|
123 |
+
# 批量处理消息
|
124 |
+
def batch_process_messages(messages, batch_size=10, delay=1):
    """Classify messages in batches, pausing between batches.

    Each message dict is updated in place with a "classification" key.
    Messages whose "content" is missing, empty, or at most 5 characters long
    are labelled "TrueNegative" without calling the API.

    Args:
        messages: List of message dicts.
        batch_size: Number of messages handled per batch.
        delay: Seconds to sleep between batches.

    Returns:
        list: The same message dicts, in their original order.
    """
    criteria = define_positive_sample_criteria()
    total = len(messages)
    processed = []

    for start in tqdm(range(0, total, batch_size), desc="处理批次"):
        chunk = messages[start:start + batch_size]

        for item in tqdm(chunk, desc="处理消息", leave=False):
            text = item.get("content")
            if text and len(text) > 5:
                label = classify_with_llm(text, criteria)
            else:
                # Very short or empty messages default to negative.
                label = "TrueNegative"
            item["classification"] = label
            processed.append(item)

        # No pause after the final batch.
        if start + batch_size < total:
            time.sleep(delay)

    return processed
|
159 |
+
|
160 |
+
# 主函数
|
161 |
+
def main():
    """Classify the messages in Messages.json and write the two sample files.

    Reads ``Messages.json`` from the current working directory, asks on stdin
    how many messages to process, classifies them, and writes
    ``positive_samples.json`` and ``negative_samples.json``. Messages whose
    classification failed are written to neither file; their count is
    reported so the loss is visible.
    """
    # Refuse to run without an API key.
    if not SILICONFLOW_API_KEY:
        print("警告: 未设置SiliconFlow API密钥。请设置环境变量SILICONFLOW_API_KEY或在代码中直接设置。")
        return

    input_file = "Messages.json"
    if not os.path.exists(input_file):
        print(f"错误: 找不到JSON输入文件 {input_file}")
        return

    print(f"使用输入文件: {input_file}")

    with open(input_file, "r", encoding="utf-8") as f:
        messages = json.load(f)

    print(f"读取了 {len(messages)} 条消息")

    # Ask whether to process everything or only a leading sample.
    sample_size = input("请输入要处理的消息数量(输入'all'处理所有消息,或输入一个数字如'100'处理部分消息): ")

    if sample_size.strip().lower() != "all":
        try:
            sample_size = int(sample_size)
            if sample_size <= 0:
                # Zero or negative would slice from the wrong end or select
                # nothing; treat it like any other invalid input.
                raise ValueError("sample size must be positive")
            if sample_size < len(messages):
                print(f"将处理 {sample_size} 条消息作为样本")
                messages = messages[:sample_size]
            else:
                print(f"样本大小大于等于总消息数,将处理所有 {len(messages)} 条消息")
        except ValueError:
            print("无效输入,将处理所有消息")

    print("开始处理消息...")
    classified_messages = batch_process_messages(messages)

    # Split by label.
    positive_samples = [msg for msg in classified_messages if msg.get("classification") == "TruePositive"]
    negative_samples = [msg for msg in classified_messages if msg.get("classification") == "TrueNegative"]
    failed_count = len(classified_messages) - len(positive_samples) - len(negative_samples)

    print(f"分类完成: TruePositive {len(positive_samples)} 条, TrueNegative {len(negative_samples)} 条")
    if failed_count:
        print(f"警告: {failed_count} 条消息分类失败,未写入任何输出文件")

    with open("positive_samples.json", "w", encoding="utf-8") as f:
        json.dump(positive_samples, f, ensure_ascii=False, indent=2)

    with open("negative_samples.json", "w", encoding="utf-8") as f:
        json.dump(negative_samples, f, ensure_ascii=False, indent=2)

    # Only JSON is written; the old message also claimed CSV output.
    print("结果已保存到 positive_samples.json 和 negative_samples.json")


if __name__ == "__main__":
    main()
|
LLM/Database/negative_samples.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[]
|
LLM/Database/positive_samples.json
ADDED
File without changes
|
LLM/Database/update_Messagejson.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import mysql.connector
|
2 |
+
from datetime import datetime # 导入 datetime 模块
|
3 |
+
import os
|
4 |
+
from pathlib import Path
|
5 |
+
import json
|
6 |
+
|
7 |
+
# Azure MySQL connection: the CA certificate is expected next to this script.
current_dir = Path(__file__).parent.absolute()
ssl_ca_path = current_dir.joinpath("DigiCertGlobalRootCA.crt.pem")
|
10 |
+
|
11 |
+
#写入json时对datetime类型进行序列化
|
12 |
+
def datetime_serializer(obj):
    """JSON ``default`` hook that renders date/time values as ISO 8601 text.

    Args:
        obj: A value ``json.dump`` could not serialise on its own.

    Returns:
        str: ``obj.isoformat()`` for ``datetime``, ``date`` and ``time`` values.

    Raises:
        TypeError: For any other type; the message names the offending type.
    """
    # Local import: at module level the name `datetime` is bound to the class,
    # so the module is imported here under a private alias.
    import datetime as _dt

    if isinstance(obj, (_dt.datetime, _dt.date, _dt.time)):
        return obj.isoformat()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
|
16 |
+
|
17 |
+
# Export the whole Messages table to Messages.json.
cnx = None
try:
    # The password is read from the environment; credentials must not be
    # committed to the repository. User and host can be overridden the same way.
    db_password = os.environ.get("TODOAGENT_DB_PASSWORD")
    if not db_password:
        raise RuntimeError("未设置环境变量 TODOAGENT_DB_PASSWORD,无法连接数据库")

    # Open the database connection (TLS, verified against the bundled CA file).
    cnx = mysql.connector.connect(
        user=os.environ.get("TODOAGENT_DB_USER", "siyuwang541"),
        password=db_password,
        host=os.environ.get("TODOAGENT_DB_HOST", "todoagent-databases.mysql.database.azure.com"),
        port=3306,
        database="todoagent",
        ssl_ca=str(ssl_ca_path),
        ssl_disabled=False
    )

    print("数据库连接成功!")

    cursor = cnx.cursor()
    cursor.execute("SELECT * FROM Messages")
    # Column names come from the cursor metadata.
    columns = [desc[0] for desc in cursor.description]

    rows = cursor.fetchall()

    # One dict per row, keyed by column name.
    data = [dict(zip(columns, row)) for row in rows]

    print("表头:", columns)
    print("数据:")
    for row in data:
        print(row)

    # datetime values are not JSON-native, hence the custom serialiser.
    with open("Messages.json", "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4, default=datetime_serializer)

    cursor.close()

except mysql.connector.Error as err:
    print(f"数据库错误: {err}")
except Exception as e:
    print(f"发生异常: {str(e)}")
finally:
    # Close the connection on every path, not only on success.
    if cnx is not None:
        cnx.close()
        print("连接已正常关闭")
|
62 |
+
|
63 |
+
|
LLM/Notify/NotifyReadme.md
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!--
|
2 |
+
* @Description:
|
3 |
+
* @Author: Manda
|
4 |
+
* @Version:
|
5 |
+
* @Date: 2025-03-30 17:01:58
|
6 |
+
* @LastEditors: Manda
|
7 |
+
* @LastEditTime: 2025-04-03 13:39:35
|
8 |
+
-->
|
9 |
+
# 运行方式
|
10 |
+
确保DigiCertGlobalRootCA.crt.pem与config.yaml在同一文件夹内
|
11 |
+
运行 pip install -r requirements.txt 安装环境
|
12 |
+
运行 python notifyMain.py
|
13 |
+
|
14 |
+
## 20250403
|
15 |
+
### 更新数据库连接办法
|
16 |
+
启用Azure 与 GG老师的 yaml
|
17 |
+
更新requirements.txt,增加yaml
|
18 |
+
|
19 |
+
## 20250402
|
20 |
+
### notifyMain.py 主函数
|
21 |
+
主函数,每隔1小时调用下方三个py('db2txt.py', 'usrSpareTime.py', 'compareDb2txt.py')
|
22 |
+
运行 pip install -r requirements.txt 安装环境
|
23 |
+
运行 python notifyMain.py
|
24 |
+
-----------------------------
|
25 |
+
得到的效果是:
|
26 |
+
生成compare_output,output,time_analysis 文件夹
|
27 |
+
#### compare_output
|
28 |
+
对比相同user_ID的UCtodolist表 & todolist表 生成结果
|
29 |
+
#### output
|
30 |
+
拉取 todolist表 生成user_ID.txt
|
31 |
+
#### time_analysis
|
32 |
+
对比相同user_ID的UCtodolist表不同更改时段,生成结果
|
33 |
+
|
34 |
+
## 20250320
|
35 |
+
一些做RAG的数据文档以user_id来命名
|
36 |
+
|
37 |
+
### dataBaseConnecter
|
38 |
+
dataBaseConnecter.py实现连接服务器功能,并提供端口让其他py(如db2txt.py)将指定数据库内文本提取出来
|
39 |
+
|
40 |
+
### db2txt (好像直接在数据库完成了对比,这个文件貌似没啥用了)
|
41 |
+
db2txt.py 将ToDoAgent数据库中的ToDoList表格内容下载到txt中,按ToDoList表格中的user_id来命名txt,也就是不同的user_id有不同的txt
|
42 |
+
|
43 |
+
### usrSpareTime -->千人千面推送时间可以用到的RAG
|
44 |
+
usrSpareTime.py 读取ToDoAgent数据库中UCtodolist表格的last_modified字段,按36个时段进行统计,找出出现频率最高的6个时段,并将时段信息及出现次数写入txt;以相同“todo_id”为前提,查询ToDoList表格中的user_id来命名txt,也就是不同的user_id有不同的txt
|
45 |
+
|
46 |
+
### compareDb2txt-->自动生成ToDoList可以用到的RAG
|
47 |
+
compareDb2txt.py 将ToDoAgent数据库中的UCtodolist表格内容与ToDoList做对比,以相同“todo_id”为前提,对比“start_time”"end_time""location""todo_content",一旦发现有差异,则将差异内容下载到txt中,按ToDoList表格中的user_id来命名txt,也就是不同的user_id有不同的txt
|
48 |
+
|
49 |
+
|
50 |
+
|
LLM/Notify/__pycache__/dataBaseConnecter.cpython-312.pyc
ADDED
Binary file (6.5 kB). View file
|
|
LLM/Notify/compareDb2txt.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# '''
|
2 |
+
# Description:
|
3 |
+
# Author: Manda
|
4 |
+
# Version:
|
5 |
+
# Date: 2025-03-30 16:28:58
|
6 |
+
# LastEditors: mdhuang555 [email protected]
|
7 |
+
# LastEditTime: 2025-03-30 16:39:18
|
8 |
+
# '''
|
9 |
+
from dataBaseConnecter import DatabaseConnector
|
10 |
+
import os
|
11 |
+
from datetime import datetime
|
12 |
+
import sys
|
13 |
+
import io
|
14 |
+
# Force UTF-8 on stdout (presumably so the Chinese status messages print on
# consoles with a different default encoding). reconfigure() keeps the
# existing stream object, and the guard avoids an AttributeError when stdout
# has been replaced by an object without that method (e.g. a StringIO).
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')
|
15 |
+
|
16 |
+
def get_table_data(db_connector: DatabaseConnector, table_name: str) -> dict:
    """Fetch every row of *table_name* and index the rows by ``todo_id``.

    Args:
        db_connector: Connector used to open the connection and read the rows.
        table_name: Name of the table to read.

    Returns:
        dict: ``str(todo_id) -> row`` for each row returned by the connector,
        or an empty dict when the connection or the read fails (the error is
        printed, not raised).
    """
    try:
        conn = db_connector.connect_db()
        if not conn:
            print("无法连接到数据库")
            return {}

        try:
            # extract_text opens its own cursor, so none is created here.
            results = db_connector.extract_text(conn, table_name, '*')
            # Keys are stringified so both tables index the same way.
            return {str(row['todo_id']): row for row in results}
        finally:
            conn.close()
    except Exception as e:
        print(f"获取{table_name}数据错误: {e}")
        return {}
|
37 |
+
|
38 |
+
def compare_records(todolist_record: dict, uctodolist_record: dict) -> dict:
    """Report which of the tracked fields differ between two records.

    The tracked fields are ``start_time``, ``end_time``, ``location`` and
    ``todo_content``. ``datetime`` values are converted to
    ``'%Y-%m-%d %H:%M:%S'`` text before comparing, so a datetime and its
    equivalent string count as equal.

    Args:
        todolist_record: Row from the ToDoList table.
        uctodolist_record: Row from the UCtodolist table.

    Returns:
        dict: ``field -> {'ToDoList': value, 'UCtodolist': value}`` for each
        field whose (normalised) values differ; empty when all match.
    """
    def _as_comparable(value):
        # Datetimes are compared through their second-resolution text form.
        if isinstance(value, datetime):
            return value.strftime('%Y-%m-%d %H:%M:%S')
        return value

    changed = {}
    for name in ('start_time', 'end_time', 'location', 'todo_content'):
        left = _as_comparable(todolist_record.get(name))
        right = _as_comparable(uctodolist_record.get(name))
        if left != right:
            changed[name] = {'ToDoList': left, 'UCtodolist': right}
    return changed
|
60 |
+
|
61 |
+
def save_differences_to_file(differences: dict, output_dir: str = 'compare_output'):
    """Write one difference report per user into *output_dir*.

    Args:
        differences: ``todo_id -> {'user_id': ..., 'differences': {...}}``,
            where the inner mapping is
            ``field -> {'ToDoList': value, 'UCtodolist': value}``.
        output_dir: Directory for the reports; created when missing.

    Each user gets ``user_<user_id>_differences.txt``, overwritten on every
    run, and the path of each written file is printed.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Group the per-todo differences by the user who owns them.
    per_user = {}
    for todo_id, entry in differences.items():
        owner = entry['user_id']
        per_user.setdefault(owner, {})[todo_id] = entry['differences']

    def _report_lines(user_id, todos):
        # Yields the report text piece by piece, header first.
        yield f"对比时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
        yield f"用户ID: {user_id}\n"
        yield "=" * 50 + "\n\n"
        for todo_id, field_diffs in todos.items():
            yield f"待办事项ID: {todo_id}\n"
            for field, values in field_diffs.items():
                yield f" 字段: {field}\n"
                yield f" ToDoList值: {values['ToDoList']}\n"
                yield f" UCtodolist值: {values['UCtodolist']}\n"
            yield "-" * 50 + "\n"

    for user_id, todos in per_user.items():
        report_path = os.path.join(output_dir, f'user_{user_id}_differences.txt')
        with open(report_path, 'w', encoding='utf-8') as out:
            out.writelines(_report_lines(user_id, todos))

        print(f"已保存用户 {user_id} 的差异到文件: {report_path}")
|
91 |
+
|
92 |
+
def main():
    """Compare ToDoList with UCtodolist and save per-user difference reports.

    Rows are matched on ``todo_id``; only ids present in both tables are
    compared. Errors are printed rather than raised.
    """
    print("正在连接数据库...")

    try:
        db_connector = DatabaseConnector()

        # Load both tables, each indexed by todo_id.
        print("正在获取表格数据...")
        todolist_data = get_table_data(db_connector, 'ToDoList')
        uctodolist_data = get_table_data(db_connector, 'UCtodolist')

        # get_table_data returns {} on failure, so an empty result must not be
        # reported as "no differences".
        if not todolist_data or not uctodolist_data:
            print("警告: ToDoList 或 UCtodolist 没有读取到数据(表为空或读取失败),无法比较")
            return

        print("正在比较差异...")
        differences = {}
        # Sorted so the report order is the same on every run (set iteration
        # order of strings varies between interpreter runs).
        for todo_id in sorted(todolist_data.keys() & uctodolist_data.keys()):
            todolist_record = todolist_data[todo_id]
            uctodolist_record = uctodolist_data[todo_id]

            record_differences = compare_records(todolist_record, uctodolist_record)
            if record_differences:
                differences[todo_id] = {
                    'user_id': todolist_record['user_id'],
                    'differences': record_differences
                }

        if differences:
            print(f"发现 {len(differences)} 条记录有差异")
            save_differences_to_file(differences)
            print("差异已保存到文件中")
        else:
            print("未发现差异")

    except Exception as e:
        print(f"处理过程中出错: {e}")


if __name__ == "__main__":
    main()
|
LLM/Notify/dataBaseConnecter.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# '''
|
2 |
+
# Author: mdhuang555 [email protected]
|
3 |
+
# Date: 2025-03-30 15:57:22
|
4 |
+
# LastEditors: mdhuang555 [email protected]
|
5 |
+
# LastEditTime: 2025-04-03 11:32:30
|
6 |
+
# FilePath: \Notyif\dataBaseConnecter.py
|
7 |
+
# Description: 数据库连接器,支持SSL连接
|
8 |
+
# '''
|
9 |
+
import socket
|
10 |
+
import json
|
11 |
+
import mysql.connector
|
12 |
+
from typing import Dict, Any, Optional
|
13 |
+
import yaml
|
14 |
+
from pathlib import Path
|
15 |
+
|
16 |
+
class DatabaseConnector:
    """MySQL helper that can also serve table dumps over a plain TCP socket.

    Connection settings are read from ``config.yaml`` located next to this
    file, and the database connection is always opened with SSL enabled.
    """

    def __init__(self, host: str = '103.116.245.150', port: int = 3306):
        """Remember the listen address, create the server socket, load config.

        Args:
            host: Address that ``start_server`` binds to.
            port: Port that ``start_server`` binds to.
        """
        self.host = host
        self.port = port
        self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.config = self._load_config()

    def _load_config(self) -> Dict[str, Any]:
        """Load ``config.yaml`` from this file's directory.

        Returns:
            The parsed configuration, or ``{}`` when the file cannot be read
            or is empty (the error is printed, not raised).
        """
        try:
            config_path = Path(__file__).parent / "config.yaml"
            with open(config_path, "r", encoding="utf-8") as f:
                # safe_load returns None for an empty document; fall back to
                # an empty dict so the declared return type holds.
                return yaml.safe_load(f) or {}
        except Exception as e:
            print(f"加载配置文件错误: {e}")
            return {}

    def connect_db(self) -> "Optional[mysql.connector.MySQLConnection]":
        """Open an SSL connection to MySQL using the ``mysql`` config section.

        Returns:
            The open connection, or ``None`` when the CA certificate is
            missing, the configuration is incomplete or the connection fails
            (the error is printed, not raised).
        """
        try:
            # The CA certificate is expected next to this file.
            current_dir = Path(__file__).parent.absolute()
            ssl_ca_path = current_dir / "DigiCertGlobalRootCA.crt.pem"

            if not ssl_ca_path.exists():
                raise FileNotFoundError(f"SSL证书文件未找到: {ssl_ca_path}")

            mysql_config = self.config["mysql"]
            conn = mysql.connector.connect(
                host=mysql_config["host"],
                port=mysql_config.get("port", 3306),
                user=mysql_config["user"],
                password=mysql_config["password"],
                database=mysql_config["database"],
                ssl_ca=str(ssl_ca_path),
                ssl_disabled=False,
                charset='utf8mb4',
                collation='utf8mb4_unicode_ci'
            )
            return conn
        except Exception as e:
            print(f"数据库连接错误: {e}")
            return None

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return *name* wrapped in backticks after validating it.

        Only ASCII letters, digits and underscores are accepted because the
        value may come straight from a network request and is spliced into
        the SQL text (identifiers cannot be bound as query parameters).

        Raises:
            ValueError: If *name* is not a plain identifier.
        """
        valid = (
            isinstance(name, str)
            and bool(name)
            and not name.isdigit()
            and all(ch == '_' or (ch.isascii() and ch.isalnum()) for ch in name)
        )
        if not valid:
            raise ValueError(f"非法的标识符: {name!r}")
        return f"`{name}`"

    def extract_text(self, conn: "mysql.connector.MySQLConnection", table: str, column: str) -> list:
        """Select one column, several comma-separated columns, or ``*``.

        Args:
            conn: Open database connection.
            table: Table name (plain identifier).
            column: ``'*'`` for full rows, otherwise one or more
                comma-separated column names.

        Returns:
            The fetched rows as dictionaries, or ``[]`` on any error,
            including a rejected identifier (the error is printed).
        """
        try:
            # Build and validate the statement before touching the database.
            table_sql = self._quote_identifier(table)
            if column == '*':
                column_sql = '*'
            else:
                if not isinstance(column, str):
                    raise ValueError(f"非法的标识符: {column!r}")
                column_sql = ', '.join(
                    self._quote_identifier(part.strip()) for part in column.split(',')
                )
            query = f"SELECT {column_sql} FROM {table_sql}"

            cursor = conn.cursor(dictionary=True)
            try:
                cursor.execute(query)
                return cursor.fetchall()
            finally:
                # Release the cursor even when execute/fetch fails.
                cursor.close()
        except Exception as e:
            print(f"提取文本错误: {e}")
            return []

    def _handle_client(self, client_socket) -> None:
        """Read one JSON request from *client_socket* and send the reply.

        The reply is a 4-byte big-endian length prefix followed by the UTF-8
        encoded JSON document.
        """
        # NOTE(review): a single recv of 1024 bytes is kept from the original
        # protocol; larger or fragmented requests would be truncated.
        data = client_socket.recv(1024).decode('utf-8')
        request = json.loads(data)

        table = request.get('table')
        column = request.get('column')

        conn = self.connect_db()
        if conn:
            try:
                results = self.extract_text(conn, table, column)
                response = {'status': 'success', 'data': results}
            except Exception as e:
                response = {'status': 'error', 'message': str(e)}
            finally:
                conn.close()
        else:
            response = {'status': 'error', 'message': '数据库连接失败'}

        # default=str keeps rows containing datetime/Decimal values
        # serializable instead of failing the whole response.
        response_data = json.dumps(response, ensure_ascii=False, default=str)
        response_bytes = response_data.encode('utf-8')

        # sendall loops until every byte is written; send may stop early.
        client_socket.sendall(len(response_bytes).to_bytes(4, byteorder='big'))
        client_socket.sendall(response_bytes)

    def start_server(self):
        """Bind the server socket and answer requests forever.

        Each accepted connection is handled once and then closed; errors are
        printed and the loop continues with the next client.
        """
        self.server_socket.bind((self.host, self.port))
        self.server_socket.listen(5)
        print(f"服务器启动在 {self.host}:{self.port}")

        while True:
            try:
                client_socket, address = self.server_socket.accept()
                print(f"接受来自 {address} 的连接")
                # The with-block closes the client socket even when the
                # request cannot be parsed or answered.
                with client_socket:
                    self._handle_client(client_socket)
            except Exception as e:
                print(f"处理请求错误: {e}")
                continue
|
125 |
+
|
126 |
+
if __name__ == "__main__":
|
127 |
+
server = DatabaseConnector()
|
128 |
+
server.start_server()
|
LLM/Notify/db2txt.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# '''
|
3 |
+
# Author: mdhuang555 [email protected]
|
4 |
+
# Date: 2025-03-30 16:09:29
|
5 |
+
# LastEditors: mdhuang555 [email protected]
|
6 |
+
# LastEditTime: 2025-04-03 11:02:35
|
7 |
+
# FilePath: \Notify\db2txt.py
|
8 |
+
# Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
|
9 |
+
# '''
|
10 |
+
from dataBaseConnecter import DatabaseConnector
|
11 |
+
import sys
|
12 |
+
import io
|
13 |
+
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
14 |
+
import os
|
15 |
+
from datetime import datetime
|
16 |
+
|
17 |
+
def get_database_text(table: str) -> list:
    """Return every row of *table*, fetched through a DatabaseConnector.

    Returns an empty list when the connection cannot be opened or any error
    occurs; the connection is always closed afterwards.
    """
    try:
        connector = DatabaseConnector()
        connection = connector.connect_db()

        if connection:
            try:
                # '*' asks the connector for complete rows.
                return connector.extract_text(connection, table, '*')
            finally:
                connection.close()

        print("无法连接到数据库")
        return []

    except Exception as err:
        print(f"获取数据时发生错误: {err}")
        return []
|
39 |
+
|
40 |
+
def save_todos_by_user(todos: list, output_dir: str = 'output'):
    """Write the to-do records into one text file per user.

    Args:
        todos: Row dictionaries; each must contain a ``user_id`` key.
        output_dir: Directory for the ``<user_id>.txt`` files; it is created
            when it does not exist yet.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if not todos:
        print("没有数据可以保存")
        return

    # Bucket the records by the string form of their user id.
    grouped = {}
    for record in todos:
        grouped.setdefault(str(record['user_id']), []).append(record)

    for uid, records in grouped.items():
        target = os.path.join(output_dir, f'{uid}.txt')
        try:
            with open(target, 'w', encoding='utf-8') as out:
                out.write(f"导出时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                out.write(f"用户ID: {uid}\n")
                out.write("=" * 50 + "\n\n")

                for record in records:
                    out.write("待办事项:\n")
                    for field, content in record.items():
                        # Fields without a value are left out of the export.
                        if content is None:
                            continue
                        out.write(f" {field}: {content}\n")
                    out.write("-" * 50 + "\n")
            print(f"已保存用户 {uid} 的待办事项到文件: {target}")
        except Exception as err:
            print(f"保存用户 {uid} 的数据时出错: {err}")
|
75 |
+
|
76 |
+
def main():
    """Export the ToDoList table into per-user text files."""
    print("正在连接数据库...")
    rows = get_database_text('ToDoList')

    if not rows:
        print("未能获取到数据")
        return

    print(f"成功获取 {len(rows)} 条记录")
    save_todos_by_user(rows)
    print("所有数据已保存完成")
|
86 |
+
|
87 |
+
if __name__ == "__main__":
|
88 |
+
main()
|
LLM/Notify/notifyMain.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# '''
|
3 |
+
# Description: 定时运行数据库相关脚本的主程序
|
4 |
+
# Author: Manda
|
5 |
+
# Version: 1.0
|
6 |
+
# Date: 2024-03-30
|
7 |
+
# '''
|
8 |
+
import schedule
|
9 |
+
import time
|
10 |
+
import subprocess
|
11 |
+
import logging
|
12 |
+
from datetime import datetime
|
13 |
+
import os
|
14 |
+
import sys
|
15 |
+
import io
|
16 |
+
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
17 |
+
|
18 |
+
# 配置日志
|
19 |
+
logging.basicConfig(
|
20 |
+
level=logging.INFO,
|
21 |
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
22 |
+
handlers=[
|
23 |
+
logging.FileHandler('notify_main.log', encoding='utf-8'),
|
24 |
+
logging.StreamHandler()
|
25 |
+
]
|
26 |
+
)
|
27 |
+
|
28 |
+
|
29 |
+
def run_script(script_name: str):
    """Run a Python script in a child process and log its outcome.

    Args:
        script_name: Path of the script to execute.

    The child's stdout is logged on success and its stderr on failure.
    Errors while launching the process are logged, never raised.
    """
    try:
        logging.info(f"开始运行脚本: {script_name}")
        # Use the interpreter that runs this scheduler so the child sees the
        # same environment and packages; a bare "python" depends on PATH and
        # may be missing or point at another installation.
        result = subprocess.run([sys.executable, script_name],
                                capture_output=True,
                                text=True,
                                encoding='utf-8',
                                # Output that is not valid UTF-8 must not hide
                                # the real result behind a decode error.
                                errors='replace')

        if result.returncode == 0:
            logging.info(f"脚本 {script_name} 运行成功")
            if result.stdout:
                logging.info(f"输出: {result.stdout}")
        else:
            logging.error(f"脚本 {script_name} 运行失败")
            if result.stderr:
                logging.error(f"错误: {result.stderr}")
    except Exception as e:
        logging.error(f"运行脚本 {script_name} 时发生错误: {str(e)}")
|
48 |
+
|
49 |
+
|
50 |
+
def run_all_scripts():
    """Run the three database scripts in order, pausing 5 seconds after each.

    Scripts are looked up in this file's directory; a missing script is
    logged as an error and skipped.
    """
    logging.info("开始执行所有脚本")

    base_dir = os.path.dirname(os.path.abspath(__file__))

    for name in ('db2txt.py', 'usrSpareTime.py', 'compareDb2txt.py'):
        full_path = os.path.join(base_dir, name)
        if not os.path.exists(full_path):
            logging.error(f"脚本文件不存在: {full_path}")
            continue
        run_script(full_path)
        time.sleep(5)

    logging.info("所有脚本执行完成")
|
68 |
+
|
69 |
+
def main():
    """Run all scripts once, then again every hour at minute 43, forever."""
    logging.info("启动定时任务程序")

    # Register the hourly job before the immediate first run.
    schedule.every().hour.at(":43").do(run_all_scripts)
    run_all_scripts()

    while True:
        try:
            schedule.run_pending()
        except Exception as err:
            logging.error(f"运行时发生错误: {err}")
        # Wait one minute before checking the schedule again, whether or not
        # the last check failed.
        time.sleep(60)
|
86 |
+
|
87 |
+
if __name__ == "__main__":
|
88 |
+
main()
|
LLM/Notify/usrSpareTime.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# '''
|
2 |
+
# Description:
|
3 |
+
# Author: Manda
|
4 |
+
# Version:
|
5 |
+
# Date: 2025-03-30 16:42:47
|
6 |
+
# LastEditors: mdhuang555 [email protected]
|
7 |
+
# LastEditTime: 2025-03-30 16:59:19
|
8 |
+
# '''
|
9 |
+
import mysql.connector
|
10 |
+
import os
|
11 |
+
from datetime import datetime
|
12 |
+
from collections import defaultdict
|
13 |
+
from dataBaseConnecter import DatabaseConnector
|
14 |
+
import sys
|
15 |
+
import io
|
16 |
+
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
17 |
+
|
18 |
+
def connect_to_database(db_config: dict) -> "mysql.connector.MySQLConnection | None":
    """Open a MySQL connection described by *db_config*.

    Args:
        db_config: Mapping with ``host``, ``user``, ``password`` and
            ``database`` keys; an optional ``port`` key overrides the
            default of 3306.

    Returns:
        The open connection, or ``None`` when a key is missing or the
        connection fails (the error is printed, not raised).
    """
    try:
        conn = mysql.connector.connect(
            host=db_config['host'],
            # Honour a configured port instead of silently using the default.
            port=db_config.get('port', 3306),
            user=db_config['user'],
            password=db_config['password'],
            database=db_config['database'],
            charset='utf8mb4'
        )
        return conn
    except Exception as e:
        print(f"数据库连接错误: {e}")
        return None
|
32 |
+
|
33 |
+
def get_time_slot(hour: int, minute: int) -> str:
    """Return the 40-minute slot containing the given time of day.

    Slots are counted from midnight and formatted as ``HH:MM-HH:MM``.
    """
    # Round the minute-of-day down to the start of its 40-minute slot.
    slot_start = ((hour * 60 + minute) // 40) * 40

    begin_hour, begin_minute = divmod(slot_start, 60)
    finish_hour, finish_minute = divmod(slot_start + 40, 60)

    return f"{begin_hour:02d}:{begin_minute:02d}-{finish_hour:02d}:{finish_minute:02d}"
|
50 |
+
|
51 |
+
def analyze_time_slots(db_connector: DatabaseConnector) -> dict:
    """Count, per user, how often each 40-minute slot was used.

    Reads ``last_modified`` from UCtodolist joined with ToDoList on
    ``todo_id`` and groups the rows by ``user_id`` and time slot. Rows whose
    ``last_modified`` is not a ``datetime`` are ignored.

    Returns:
        ``{user_id: {slot: count}}``, or ``{}`` when the connection cannot
        be opened or an error occurs.
    """
    conn = None
    cursor = None
    try:
        conn = db_connector.connect_db()
        if not conn:
            print("无法连接到数据库")
            return {}

        cursor = conn.cursor(dictionary=True)

        query = """
            SELECT uc.todo_id, uc.last_modified, t.user_id
            FROM UCtodolist uc
            JOIN ToDoList t ON uc.todo_id = t.todo_id
            WHERE uc.last_modified IS NOT NULL
        """
        cursor.execute(query)

        usage = defaultdict(lambda: defaultdict(int))
        for record in cursor.fetchall():
            stamp = record['last_modified']
            if not isinstance(stamp, datetime):
                continue
            usage[record['user_id']][get_time_slot(stamp.hour, stamp.minute)] += 1

        return dict(usage)

    except Exception as err:
        print(f"分析时间段时出错: {err}")
        return {}
    finally:
        # Close whatever was opened before the function exits.
        if cursor is not None:
            cursor.close()
        if conn:
            conn.close()
|
92 |
+
|
93 |
+
def save_analysis_results(results: dict, output_dir: str = 'time_analysis'):
    """Write one time-slot report per user into *output_dir*.

    Args:
        results: ``{user_id: {slot: count}}`` mapping.
        output_dir: Directory for the ``user_<id>_time_analysis.txt`` files;
            it is created when it does not exist yet.

    Each report lists the six most frequent slots with their count and share
    of all modifications, followed by the totals.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for uid, slot_counts in results.items():
        report_path = os.path.join(output_dir, f'user_{uid}_time_analysis.txt')

        try:
            with open(report_path, 'w', encoding='utf-8') as report:
                report.write(f"分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                report.write(f"用户ID: {uid}\n")
                report.write("=" * 50 + "\n\n")

                report.write("时间段使用频率统计(前6名):\n")
                total = sum(slot_counts.values())
                ranked = sorted(slot_counts.items(), key=lambda pair: pair[1], reverse=True)

                for rank, (slot, hits) in enumerate(ranked[:6], 1):
                    share = (hits / total) * 100
                    report.write(f"第{rank}名: {slot}\n")
                    report.write(f" 出现次数: {hits}\n")
                    report.write(f" 占比: {share:.2f}%\n")
                    report.write("-" * 30 + "\n")

                # Totals across every slot, not only the top six.
                report.write(f"\n总修改次数: {total}\n")
                report.write(f"总时间段数: {len(slot_counts)}/36\n")

            print(f"已保存用户 {uid} 的时间分析到文件: {report_path}")

        except Exception as err:
            print(f"保存用户 {uid} 的分析结果时出错: {err}")
|
126 |
+
|
127 |
+
def main():
    """Analyse the time-slot usage of every user and save the reports."""
    print("正在连接数据库...")

    try:
        connector = DatabaseConnector()

        print("正在分析时间段分布...")
        stats = analyze_time_slots(connector)

        if not stats:
            print("未找到可分析的数据")
            return

        print(f"分析完成,共有 {len(stats)} 个用户的数据")
        save_analysis_results(stats)
        print("分析结果已保存到文件中")

    except Exception as err:
        print(f"处理过程中出错: {err}")
|
146 |
+
|
147 |
+
if __name__ == "__main__":
|
148 |
+
main()
|
LLM/filter_message/README.MD
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
## 安装依赖
|
3 |
+
|
4 |
+
|
5 |
+
```
|
6 |
+
pip install -r requirements.txt
|
7 |
+
```
|
8 |
+
|
9 |
+
## 运行示例
|
10 |
+
|
11 |
+
```
|
12 |
+
python main.py
|
13 |
+
```
|
14 |
+
|
15 |
+
|
16 |
+
## 查看结果
|
17 |
+
|
18 |
+
data目录下的json文件(未清洗)
|
LLM/filter_message/__pycache__/libs.cpython-312.pyc
ADDED
Binary file (8.75 kB). View file
|
|
LLM/filter_message/libs.py
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional
|
2 |
+
|
3 |
+
import yaml
|
4 |
+
import pymysql
|
5 |
+
from openai import OpenAI
|
6 |
+
|
7 |
+
def read_config(yaml_file):
    """Parse the YAML file at *yaml_file* and return its content."""
    with open(yaml_file, mode="r", encoding="utf-8") as stream:
        settings = yaml.safe_load(stream)
    return settings
|
11 |
+
|
12 |
+
|
13 |
+
import mysql.connector
|
14 |
+
import os
|
15 |
+
from pathlib import Path
|
16 |
+
|
17 |
+
|
18 |
+
def get_db_conn():
    """Open a new SSL connection to MySQL using ``CONFIG["mysql"]``.

    The CA certificate is expected next to this file.
    """
    db_settings = CONFIG["mysql"]

    ca_file = Path(__file__).parent.absolute() / "DigiCertGlobalRootCA.crt.pem"

    connect_kwargs = {
        "host": db_settings["host"],
        "port": db_settings.get("port", 3306),
        "user": db_settings["user"],
        "password": db_settings["password"],
        "database": db_settings["database"],
        "ssl_ca": str(ca_file),
        "ssl_disabled": False,
    }
    return mysql.connector.connect(**connect_kwargs)
|
38 |
+
|
39 |
+
|
40 |
+
def execute_sql(sql):
    """Run *sql* on the shared connection and commit.

    Returns:
        All fetched rows when the statement starts with ``SELECT``,
        otherwise the cursor's affected-row count.
    """
    with DB_CONN.cursor() as cursor:
        cursor.execute(sql)

        if not sql.strip().upper().startswith("SELECT"):
            # INSERT/UPDATE/DELETE: report the row count after committing.
            changed = cursor.rowcount
            DB_CONN.commit()
            return changed

        # SELECT does not need the commit; it is issued for uniformity.
        rows = cursor.fetchall()
        DB_CONN.commit()
        return rows
|
54 |
+
|
55 |
+
|
56 |
+
def release():
    """Close the shared database connection and drop the module globals.

    After this call ``CONFIG`` and ``DB_CONN`` no longer exist in the module
    namespace.
    """
    # Without this declaration the `del` statements below make both names
    # local to the function, and DB_CONN.close() raises UnboundLocalError.
    global CONFIG, DB_CONN

    DB_CONN.close()

    del CONFIG
    del DB_CONN
|
62 |
+
|
63 |
+
|
64 |
+
def get_llm():
    """Create the OpenAI client described by ``CONFIG["openai"]``."""
    settings = CONFIG["openai"]
    endpoint = settings["base_url"]
    secret = settings["api_key"]
    return OpenAI(base_url=endpoint, api_key=secret)
|
69 |
+
|
70 |
+
|
71 |
+
def send_llm(messages: list[dict[str, str]], model: Optional[str] = None, resp_json=False):
    """Send *messages* to the chat-completions API and return the reply text.

    Args:
        messages: Chat messages to send.
        model: Model name; defaults to ``CONFIG["openai"]["model"]``.
        resp_json: When true, request a JSON-object response format.
    """
    print(">>>>>>>>>>>>>>>>>",messages)
    settings = CONFIG["openai"]

    chosen_model = settings["model"] if model is None else model

    # Temperature 0 is used for accuracy.
    request = {
        "model": chosen_model,
        "messages": messages,
        "temperature": 0,
    }
    if resp_json:
        request["response_format"] = { "type": "json_object" }

    completion = LLM.chat.completions.create(**request)

    print("<<<<<",completion.choices[0].message.content)
    return completion.choices[0].message.content
|
95 |
+
|
96 |
+
|
97 |
+
def send_llm_with_query(query):
    """Send *query* as a single user message and return the model's reply."""
    # A real list: the original trailing comma produced a one-element tuple,
    # which does not match send_llm's list[dict] parameter.
    messages = [
        {
            "role": "user",
            "content": query,
        },
    ]
    return send_llm(messages)
|
103 |
+
|
104 |
+
|
105 |
+
def send_llm_with_prompt(query):
    """Classify a batch of messages with the SMS-analysis system prompt.

    Args:
        query: The batch of messages; it is converted with ``str()`` and sent
            as the user message.

    Returns:
        The raw reply text from ``send_llm``.
    """
    # The category names and the example objects use the same keys, and the
    # example is valid JSON, so the model is not shown a malformed or
    # inconsistent template to imitate.
    system = """
# 角色
你是一个专业的短信内容分析助手,根据输入判断内容的类型及可信度,为用户使用信息提供依据和便利。

# 任务
对于输入的多条数据,分析每一条数据内容(主键:`message_id`)属于【物流取件、欠费缴纳、待付(还)款、会议邀约、其他】的可能性百分比。
主要对于聊天、问候、回执、结果通知、上月账单等信息不需要收件人进行下一步处理的信息,直接归到其他类进行忽略

# 要求
1. 以json格式输出
2. content简洁提炼关键词,字符数<20以内
3. 输入条数和输出条数完全一样

# 输出示例
```
[
{"message_id":"1111111","content":"账单805.57元待还","物流取件":0,"欠费缴纳":99,"待付(还)款":1,"会议邀约":0,"其他":0, "分类":"欠费缴纳"},
{"message_id":"222222","content":"邀请你加入飞书视频会议","物流取件":0,"欠费缴纳":0,"待付(还)款":0,"会议邀约":100,"其他":0, "分类":"会议邀约"}
]

```
"""

    messages = [
        {
            "role": "system",
            "content": system,
        },
        {
            "role": "user",
            "content": str(query),
        }
    ]
    return send_llm(messages)
|
140 |
+
|
141 |
+
def save_to_mysql(data):
    """Store classification results in ``message_stats`` and ``filter_message``.

    Args:
        data: Dictionaries keyed by the names in ``COLUMN_MAPPING``. Items
            are updated in place (cleaned ``content``, possibly a changed
            ``分类``).

    Every item is upserted into ``message_stats``; items whose category is
    not "其他" are additionally inserted into ``filter_message``. Database
    errors are printed and the current batch is rolled back.
    """
    # Result key -> database column name.
    COLUMN_MAPPING = {
        "message_id": "message_id",
        "content": "content",
        "物流取件": "logistics_pickup",
        "欠费缴纳": "overdue_payment",
        "待付(还)款": "pending_payment",
        "会议邀约": "meeting_invitation",
        "其他": "other",
        "分类": "category"
    }

    BATCH_SIZE = 100  # rows per executemany call, to reduce lock contention
    conn = get_db_conn()

    try:
        with conn.cursor() as cursor:
            columns = list(COLUMN_MAPPING.values())
            placeholders = ', '.join(['%s'] * len(columns))
            updates = ', '.join(
                f"{col} = VALUES({col})" for col in columns if col != 'message_id'
            )
            sql = f"""
                INSERT INTO message_stats
                ({', '.join(columns)})
                VALUES ({placeholders})
                ON DUPLICATE KEY UPDATE
                {updates}
            """

            values = []
            for item in data:
                # Drop characters that cannot be encoded as UTF-8 (e.g. lone
                # surrogates). A missing content key becomes an empty string
                # instead of aborting the whole save.
                item["content"] = (
                    str(item.get("content", ""))
                    .encode('utf-8', errors='ignore')
                    .decode('utf-8')
                )

                # Rule 1 (disabled in the original code): a "会议邀约" item
                # whose content lacks "邀请你加入飞书视频会议" would be
                # reclassified as "其他".

                # Rule 2: an overdue-payment item that mentions a completed
                # payment ("缴费支出") needs no action, so it becomes "其他".
                if item.get("分类") == "欠费缴纳" and "缴费支出" in item.get("content", ""):
                    item["分类"] = "其他"

                values.append([item.get(key, None) for key in COLUMN_MAPPING])

            # Upsert into message_stats in batches.
            for start in range(0, len(values), BATCH_SIZE):
                batch = values[start: start + BATCH_SIZE]
                cursor.executemany(sql, batch)
                conn.commit()
                print(f"成功插入 {len(batch)} 条数据到 message_stats")

            # Only actionable items (category other than "其他") are copied
            # into filter_message.
            filter_sql = """
                INSERT IGNORE INTO filter_message (message_id, content)
                VALUES (%s, %s)
            """

            filter_values = [
                (item.get("message_id"), item.get("content"))
                for item in data
                if item.get("分类") != "其他"
            ]

            for start in range(0, len(filter_values), BATCH_SIZE):
                batch = filter_values[start: start + BATCH_SIZE]
                cursor.executemany(filter_sql, batch)
                conn.commit()
                print(f"成功插入 {len(batch)} 条数据到 filter_message")

    # The connection comes from mysql.connector, whose errors are not
    # pymysql.MySQLError instances; catch both so failures are rolled back
    # and reported instead of propagating.
    except (mysql.connector.Error, pymysql.MySQLError) as e:
        conn.rollback()
        print(f"数据插入失败: {e}")

    finally:
        conn.close()
|
213 |
+
###### init #####
|
214 |
+
|
215 |
+
CONFIG = read_config("filter_llm_config.yaml")
|
216 |
+
DB_CONN = get_db_conn()
|
217 |
+
LLM = get_llm()
|
LLM/filter_message/main.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# main.py
|
2 |
+
import json
|
3 |
+
import time
|
4 |
+
|
5 |
+
from libs import save_to_mysql, execute_sql, send_llm_with_prompt
|
6 |
+
|
7 |
+
|
8 |
+
def get_message_with_page(page_num, page_size=10):
    """Fetch one page of unprocessed messages, deduplicated by content.

    Args:
        page_num: Zero-based page index.
        page_size: Number of rows per page.

    Returns:
        Rows of ``(content, message_id, date)`` for messages that have no
        ``message_stats`` entry yet.
    """
    # Coerce to int so only numbers can ever reach the SQL text.
    page_size = int(page_size)
    offset = int(page_num) * page_size
    # ORDER BY gives LIMIT/OFFSET a stable order; without it rows may repeat
    # or be skipped between pages.
    sql = f"""
        SELECT
            m.content,
            MAX(m.message_id) as message_id,
            MAX(m.date) as date
        FROM
            Messages m
            left join message_stats n on m.message_id=n.message_id
        WHERE
            m.app_name IN ('com.tencent.mm', 'SMS', 'com.ss.android.lark','com.ss.android.teams')
            AND DATE(m.date) >= '2025-04-26'
            AND LENGTH(m.content) > 50
            AND m.content NOT LIKE '%可用余额%'
            and n.message_id is null
        GROUP BY
            m.content
        ORDER BY
            message_id
        LIMIT {page_size} OFFSET {offset};
    """
    print("======", sql)
    return execute_sql(sql)
|
31 |
+
|
32 |
+
|
33 |
+
def get_page_count(page_size=10):
    """Return how many pages of unprocessed messages exist.

    Args:
        page_size: Number of rows per page.

    The filter mirrors the one in ``get_message_with_page`` (same apps, same
    start date), so the count matches the rows that are actually paged.
    """
    sql = """
        SELECT
            COUNT(1) AS total_rows
        FROM
            (
                SELECT
                    m.content
                FROM
                    Messages m
                    LEFT JOIN message_stats n ON m.message_id = n.message_id
                WHERE
                    m.app_name IN ('com.tencent.mm', 'SMS', 'com.ss.android.lark','com.ss.android.teams')
                    AND DATE(m.date) >= '2025-04-26'
                    AND LENGTH(m.content) > 50
                    AND m.content NOT LIKE '%可用余额%'
                    AND n.message_id IS NULL
                GROUP BY
                    m.content
            ) AS subquery;
    """
    rows = execute_sql(sql)
    count = rows[0][0]

    # Ceiling division: a partially filled last page still counts.
    page_count = (count + page_size - 1) // page_size

    print(f'表中数据共 {page_count} 页')
    return page_count
|
64 |
+
|
65 |
+
|
66 |
+
def data_to_todo(data: tuple[tuple], todo_list: list[dict], debug=False, max_retries: int = 3):
    """Classify one page of rows with the LLM and collect the results.

    Args:
        data: Rows whose second element is the message id.
        todo_list: Output list; parsed result objects are appended to it.
        debug: Unused; kept for backward compatibility.
        max_retries: How many more times rows missing from the reply are
            resubmitted before giving up.

    Rows the model leaves out of its answer are sent again on their own.
    Parsing problems are printed and the page is skipped.
    """
    if not data:
        # Nothing to classify; avoid an LLM call with empty input.
        return

    request_id_list = {int(d[1]) for d in data}

    resp = send_llm_with_prompt(data)
    print(f'请求id: {len(request_id_list)} ->{request_id_list}')

    try:
        # Strip Markdown code fences that may wrap the JSON payload.
        resp = resp.replace("```json", "").replace("```", "")
        print(' ' + resp)
        print(' ' + '-' * 20)

        parsed_resp = json.loads(resp)
        response_id_list = {int(d['message_id']) for d in parsed_resp}
        print(f'响应id: {len(response_id_list)} -> {response_id_list}')

        diff = request_id_list - response_id_list
        if diff:
            print(f'本次处理有遗漏:{diff}')
            if max_retries > 0:
                # Compare as int: ids in the rows may not be plain ints, and
                # a raw comparison would never match the int ids in diff.
                diff_data = tuple(d for d in data if int(d[1]) in diff)
                data_to_todo(diff_data, todo_list, debug, max_retries - 1)
            else:
                # Bounded retries keep a stubborn omission from recursing
                # without end.
                print(f'重试次数已用尽,仍有遗漏:{diff}')

        todo_list.extend(parsed_resp)
        print(f'{len(todo_list)=}')
    except Exception as e:
        print(f"解析响应失败: {e}")
|
91 |
+
|
92 |
+
|
93 |
+
def main():
    """Classify every page of unprocessed messages and save the results."""
    collected = []

    total_pages = get_page_count()

    # Pages are numbered from 0.
    for page_index in range(total_pages):
        print(f'正在处理第 {page_index + 1} / {total_pages} 页数据')
        page_rows = get_message_with_page(page_index)
        if not page_rows:
            print('没有更多数据了')
            break

        data_to_todo(page_rows, collected)

    # Persist everything gathered across the pages in one go.
    if not collected:
        print("没有需要保存的数据")
        return

    save_to_mysql(collected)
    print(f"成功保存{len(collected)}条数据到数据库")
|
114 |
+
|
115 |
+
|
116 |
+
def main_loop():
    """Run ``main`` forever, pausing 30 seconds between runs."""
    pause_seconds = 30
    while True:
        main()
        time.sleep(pause_seconds)
|
120 |
+
|
121 |
+
|
122 |
+
if __name__ == '__main__':
|
123 |
+
main()
|
124 |
+
# main_loop()
|
LLM/filter_message/prompt.md
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## 人工标注数据
|
2 |
+
|
3 |
+
| content | 是否正负样本 | 关键词 | 类型 |
|
4 |
+
| ------------------------------------------------------------ | ------------ | ------------------------ | -------- |
|
5 |
+
| 【建设银行】您账户8699于3月24日12时22分向微信支付-扫二维码付款支出人民币15元,可用余额5433.72元。 | FALSE | 可用余额 | |
|
6 |
+
| 【建设银行】您账户8699于3月24日12时11分向支付宝-天猫-深圳市升景科技有限公司支出人民币1889元,可用余额5465.6元。 | FALSE | 可用余额 | |
|
7 |
+
| 【建设银行】您账户8699于3月24日12时11分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额5448.72元。 | FALSE | 可用余额 | |
|
8 |
+
| 【驿收发】您的京东包裹已到惜福堂对面快递驿站,请21:00前凭2-0147来取,详询16675200600 | TRUE | 驿站、包裹、取件 | 包裹物流 |
|
9 |
+
| 【中国农业银行】为您特别准备了一份专属月度账单,登掌银搜“月度账单”或戳 go.abchina.com/k/CS7 查收。拒收请回复R | FALSE | 拒收 | |
|
10 |
+
| 【中国农业银行】充话费最高减20元,本月登掌银【城市专区-优惠立减】或点 go.abchina.com/k/C05 先到先得。拒收请回复R | FALSE | 拒收 | |
|
11 |
+
| 刻刻: 【火山引擎】亲爱的用户,您有千万DeepSeek模型额度待领取哦!每邀请1位新用户注册和使用,最高可获得130元代金券,多邀多得上不封顶!点击下载海报,分享到更多渠道:szacq.cn/ewyeb/ 拒收请回复R | FALSE | 拒收 | |
|
12 |
+
| M丶D: 【阿里云推广】尊敬的mandahuang,老友季钜惠!云服务器 99元/年,新购续费同享!戳>>https://t.aliyun.com/TIFqsCZY 立即抢购! 拒收请回复R | FALSE | 拒收 | |
|
13 |
+
| 王斯煜[表情]Vince 黑客松nv: 【AlipayHK】於3月19日 08:15需要通過Apple 服務(Apple services)待付款支付HKD8.00 | TRUE | 账单、待付款 | 支付 |
|
14 |
+
| [2条]李JK老师-1228: 【百度】亲爱的开发者您好,文心智能体年底送福利,即日起至12月30日,创建并提交您的智能体,即有机会获得现金卡福利,最高500元,多档奖励,中奖机会大!登陆天数越多,创建有创意,中奖率越高!福利倒计时!点击参与!https://agents.baidu.com/activity/detail/13 | FALSE | 送福利、有机会 | |
|
15 |
+
| [5条]LJK86: 【百度】亲爱的开发者您好,文心智能体年底送福利,即日起至12月30日,创建并提交您的智能体,即有机会获得现金卡福利,最高500元,多档奖励,中奖机会大!登陆天数越多,创建有创意,中奖率越高!福利倒计时!点击参与!https://agents.baidu.com/activity/detail/13 | FALSE | 送福利、有机会 | |
|
16 |
+
| [3条]M丶D: 【韵达快递】亲434466097408983超10小时未取出,如需帮助或有问题请致电15900077340 、020-89725127 | TRUE | 快递、取件 | 包裹物流 |
|
17 |
+
| [2条]LJK86: 【智谱AI】亲爱的开发者您好,感谢您参与智谱开放平台满意度调研,您的智谱清言月卡奖励正在发放,请在链接中输入您问卷中填写的手机号查询礼品码,前往PC/APP智谱清言会员充值页,选择“礼品码兑换”。查询链接:https://zhipu-ai.feishu.cn/share/base/query/shr... | FALSE | 满意度调研 | |
|
18 |
+
| [2条]M丶D: 【中国电信】流量满满,温暖相伴,我们特别为你准备了预存领10GB的流量大礼,无论是与家人视频通话,还是朋友间分享趣事,都能畅通无阻,让爱不断线。马上戳 https://vipxjzl.mini189.cn/BG/ 了解吧,具体以实际页面展示为准,如已办理请忽略,转发无效。拒收请回复R | FALSE | 拒收 | |
|
19 |
+
| [3条]王斯煜[表情]Vince 黑客松nv: 【中通快递】包裹已到深圳光明正大城商业街103号店,取件码3-5-1092。到店扫“取件二维码”,线上查询更方便!询19128399078 | TRUE | 快递、包裹、取件码 | 包裹物流 |
|
20 |
+
| 王斯煜[表情]Vince 黑客松nv: 尊敬的*斯煜,您在我行办理的1笔个人贷款需于2024年12月08日17:00前还款,当期还款金额本息合计999999.84元,请您留意尾号0455的账户可用余额是否充足,避免因贷款逾期影响个人征信。具体贷款信息可通过工行网上银行、手机银行或致电贷款经办行查询。【工商银行】 | TRUE | 银行、贷款、还款 | 支付 |
|
21 |
+
| [2条]李JK老师-1228: 【51CTO学堂】《DeepSeek训练营》火爆来袭!0元报名,学习AI核心技术,解锁职业新可能 zt60.cn/CMPE9 拒收请回复R | FALSE | 拒收 | |
|
22 |
+
| [3条]刻刻: 60GB本地數據及5000分鐘本地通話30日組合成功開啟,餘額已被扣除,有效期至10/02/2025 23:59。請立即登入MyLink App bit.ly/MySimMyLink 進行增值,若賬戶餘額充足,60GB本地數據及5000分鐘本地通話30日組合將每30日以$38自動續期,並於餘額內自動... | FALSE | | |
|
23 |
+
| [3条]李JK老师-1228: 【湖南通信】尊敬的用户:您订购的卡已配送暂未签收,为了更好的为您服务,烦请回复:未拿到卡且需要回复1,卡在站点未取件回复2,已取件回复3。如有疑问请拨打4008155555 | TRUE | 配送、未签收、未取件 | 包裹物流 |
|
24 |
+
| [2条]李JK老师-1228: 【深势科技】Bohrium用户您好,您有共计余额20.00元的体验卡将于2024-12-24 23:59:59到期,请及时使用。 | FALSE | 共计余额 | |
|
25 |
+
| [3条]李JK老师-1228: 【讯飞开放平台】到期预警!尊敬的会员用户:您的个人级乐享会员,将于2024-12-08 00:00:00正式到期,截至目前仅剩7天。如您要继续使用,自即日起,7天内完成会员体验问卷http://1024-2019.iflytek.com/h5/vip-ques?t=2024-12-15,即可免费续约... | TRUE | 到期 | 到期提醒 |
|
26 |
+
| M丶D: 【中通快递】73547529665397放在丰巢的包裹请及时取件,如有问题致电15900077340、020-22504077。 | TRUE | 快递、取件 | 包裹物流 |
|
27 |
+
| [33条]斯煜[表情]Vince: 【中通快递】包裹已到深圳光明正大城商业街103号店,取件码3-5-1092。到店扫“取件二维码”,线上查询更方便!询19128399078 | TRUE | 快递、取件、取件码 | 包裹物流 |
|
28 |
+
| 王斯煜[表情]Vince 黑客松nv: 【申通快递】快递尾号7117已在代收点1天未取,请尽快取件,详询13434525312,最新快递状态请访问t.sto.cn/18PBt4 | TRUE | 快递、取件 | 包裹物流 |
|
29 |
+
| [2条]王斯煜[表情]Vince 黑客松nv: 【申通快递】包裹已到深圳光明正大城商业街103号店,取件码5-5-7117。到店扫“取件二维码”,线上查询更方便!询19128399078 | TRUE | 快递、取件码 | 包裹物流 |
|
30 |
+
| [2条]王斯煜[表情]Vince 黑客松nv: 【美团月付】您2月账单805.57元待还,最后还款日为本月22号,查账或立即还款点 dpurl.cn/HXFUxEba | TRUE | 账单、还款 | 支付 |
|
31 |
+
| [32条]斯煜[表情]Vince: 【美团月付】您2月账单805.57元待还,最后还款日为本月22号,查账或立即还款点 dpurl.cn/HXFUxEba | TRUE | 账单、还款 | 支付 |
|
32 |
+
| [30条]斯煜[表情]Vince: 【美团月付】您的2月账单805.57元需要付款,查看详情点击 dpurl.cn/80JQdKHa | TRUE | 账单、付款 | 支付 |
|
33 |
+
| [26条]斯煜[表情]Vince: Mox: 你的Mox Credit月結單已準備就緒。你可登入Mox應用程式查看。 | TRUE | 账单 | 支付 |
|
34 |
+
| 【建设银行】您账户8699于3月22日9时51分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额7414.6元。 | FALSE | 可用余额 | |
|
35 |
+
| [3条]M丶D: 明天九点开会 | TRUE | 时间、开会 | 会议 |
|
36 |
+
| 【建设银行】您账户8699于3月22日9时42分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额7431.48元。 | FALSE | 可用余额 | |
|
37 |
+
| 【建设银行】您账户8699于3月22日9时32分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额7448.36元。 | FALSE | 可用余额 | |
|
38 |
+
| 【建设银行】您账户8699于3月22日9时12分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额7465.24元。 | FALSE | 可用余额 | |
|
39 |
+
| 【建设银行】您账户8699于3月22日8时53分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额7482.12元。 | FALSE | 可用余额 | |
|
40 |
+
| 【建设银行】您账户8699于3月22日18时2分向微信支付-羊城通缴费支出人民币2元,可用余额7397.6元。 | FALSE | 可用余额 | |
|
41 |
+
| 【招商银行】您的风险评估已到期,为避免错过我行优质产品信息,请及时重评!点击 cmbt.cn/a/zhV 去评估。如已完成或销户请忽略 | FALSE | 请忽略 | |
|
42 |
+
| 【建设银行】您账户8699于3月22日23时45分向微信支付-西苑出版社支出人民币180元,可用余额7215.6元。 | FALSE | 可用余额 | |
|
43 |
+
| 【建设银行】您账户8699于3月22日22时47分向微信支付-羊城通缴费支出人民币2元,可用余额7395.6元。 | FALSE | 可用余额 | |
|
44 |
+
| 【中通快递】73547529665397放在丰巢的包裹请及时取件,如有问题致电15900077340、020-22504077。 | TRUE | 快递、丰巢、取件 | 包裹物流 |
|
45 |
+
| 【建设银行】您账户8699于3月22日13时16分向微信支付-扫二维码付款支出人民币15元,可用余额7399.6元。 | FALSE | 可用余额 | |
|
46 |
+
| 【建设银行】您账户8699于3月21日8时11分向微信支付-小霞包点(香雪店)支出人民币5.5元,可用余额7566.14元。 | FALSE | 可用余额 | |
|
47 |
+
| 【建设银行】您账户8699于3月21日8时7分向微信支付-羊城通缴费支出人民币3.5元,可用余额7571.64元。 | FALSE | 可用余额 | |
|
48 |
+
| 【建设银行】您账户8699于3月21日20时5分向微信支付-羊城通缴费支出人民币1元,可用余额7519.38元。 | FALSE | 可用余额 | |
|
49 |
+
| 【中国农业银行】百万立减金限时抢,最高10元立减金等您拿,本月戳 go.abchina.com/k/CfK 直达,先到先得。拒收请回复R | FALSE | 拒收 | |
|
50 |
+
| 【建设银行】您账户8699于3月21日15时28分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额7520.38元。 | FALSE | 可用余额 | |
|
51 |
+
| 【建设银行】您账户8699于3月21日15时28分向支付宝-理财-天弘基金管理有限公司支出人民币16.88元,可用余额7537.26元。 | FALSE | 可用余额 | |
|
52 |
+
| M丶D: 【话费账单】尊敬的150****9503客户,您02月01日- 02月28日共消费8.00元。主要消费项目包括: | FALSE | 共消费 | |
|
53 |
+
| 菜鳥包裹HK00083544922到達香港仔置富道19號置富花園19座地下B舖,提件碼7-1-1002,3個工作日内取。 | TRUE | 菜鸟、包裹、到达、提件码 | 包裹物流 |
|
54 |
+
| 您的集運單SF3148634434778因其他异常已拒收退回賣家,建議您聯系賣家處理,如有必要可申請退款 | TRUE | 申请退款 | 包裹物流 |
|
55 |
+
| 菜鳥包裹HK00083525678到達香港仔置富道19號置富花園19座地下B舖,提件碼7-1-1004,3個工作日内取。 | TRUE | 菜鸟、包裹、到达、提件码 | 包裹物流 |
|
56 |
+
| [10条]王斯煜[表情]Vince 黑客松nv: 菜鳥包裹HK00083544922到達香港仔置富道19號置富花園19座地下B舖,提件碼7-1-1002,3個工作日内取。 | TRUE | 菜鸟、包裹、到达、提件码 | 包裹物流 |
|
57 |
+
| [11条]王斯煜[表情]Vince 黑客松nv: 您的集運單SF3148634434778因其他异常已拒收退回賣家,建議您聯系賣家處理,如有必要可申請退款 | TRUE | 申请退款 | 包裹物流 |
|
58 |
+
| 【建设银行】您账户8699于3月21日12时20分向微信支付-扫二维码付款支出人民币12元,可用余额7554.14元。 | FALSE | 可用余额 | |
|
59 |
+
| [2条]AlisaGG陈G老师1010: 预约一个会和你 15分钟。我也正经了[加油] | TRUE | 预约、开会、时间 | 会议 |
|
60 |
+
| 【建设银行】您账户8699于3月20日20时3分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额7574.12元。 | FALSE | 可用余额 | |
|
61 |
+
| 【建设银行】您账户8699于3月20日19时51分向微信支付-羊城通缴费支出人民币3.5元,可用余额7591元。 | FALSE | 可用余额 | |
|
62 |
+
| 【建设银行】您账户8699于3月20日18时36分向微信支付-羊城通缴费支出人民币1元,可用余额7594.5元。 | FALSE | 可用余额 | |
|
63 |
+
| WANG Siyu: @AlisaGG 6:00PM后 今晚 什么时候有空 | TRUE | 有空、时间 | 会议 |
|
64 |
+
| 【建设银行】您账户8699于3月27日8时3分向微信支付-羊城通缴费支出人民币3.5元,可用余额5240.93元。 | FALSE | 可用余额 | |
|
65 |
+
| 【建设银行】您账户8699于3月27日8时28分向微信支付-百度平台商家支出人民币18.95元,可用余额5221.98元。 | FALSE | 可用余额 | |
|
66 |
+
| 【建设银行】您账户8699于3月27日9时12分向支付宝-理财-天弘基金管理有限公司支出人民币16.88元,可用余额5205.1元。 | FALSE | 可用余额 | |
|
67 |
+
| 【广州银行】您尾号5138的卡片转入人民币550.00元,本期账单已还清。绑定广州银行信用卡官微实时查账。 | FALSE | 账单、还清 | |
|
68 |
+
| 【建设银行】您账户8699于3月27日9时26分向支付宝-黄敏达还款支出人民币550元,可用余额4655.1元。 | FALSE | 可用余额 | |
|
69 |
+
| 【建设银行】您账户8699于3月27日9时32分向支付宝-广州市自来水有限公司支出人民币17.58元,可用余额4637.52元。 | FALSE | 可用余额 | |
|
70 |
+
| 【建设银行】您账户8699于3月27日9时32分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额4620.64元。 | FALSE | 可用余额 | |
|
71 |
+
| 【建设银行】您账户8699于3月27日9时48分向微信支付-壹伴助手支出人民币399元,可用余额4221.64元。 | FALSE | 可用余额 | |
|
72 |
+
| 【广州银行】黄敏达先生,您好。您的信用卡欠款未缴,请即日内还款,否则我行不排除根据客户协议委托第三方公司向您催缴。若已还款无需理会 | TRUE | 信用卡、还款 | 支付 |
|
73 |
+
| 【中国农业银行】尊敬的用户,您已超过1个月未登录账户,邀您点击 go.abchina.com/k/C7C 查看账户详情。拒收请回复R | FALSE | 拒收 | |
|
74 |
+
| 【建设银行】您账户8699于3月27日12时38分向微信支付-兰州拉面(佳大)支出人民币16元,可用余额4205.64元。 | FALSE | 可用余额 | |
|
75 |
+
| [23条]M丶D: 1000本链接: https://pan.baidu.com/s/13dV3m54iGE8oWgYtkARQPw?pwd=vwq3 提取码: vwq3 复制这段内容后打开百度网盘手机App,操作更方便哦 | FALSE | 百度网盘 | |
|
76 |
+
| 【建设银行】您账户8699于3月27日9时32分向支付宝-广州市自来水有限公司支出人民币17.58元,可用余额4637.52元。 | FALSE | 可用余额 | |
|
77 |
+
| 招商银行风险评估到期提醒 | FALSE | 到期提醒 | 提醒 |
|
78 |
+
| 【建设银行】您账户8699于3月27日9时32分向支付宝-广州市自来水有限公司支出人民币17.58元,可用余额4637.52元。 | FALSE | 可用余额 | |
|
79 |
+
| 重构工作完成时间 | FALSE | | 聊天 |
|
80 |
+
| 上课时间 | FALSE | | 提醒 |
|
81 |
+
| 明天上班 | FALSE | | 聊天 |
|
82 |
+
| 等待黄老师远程会议 | FALSE | 等待 | 聊天 |
|
83 |
+
| 确认代码与数据库兼容性 | FALSE | 确认 | |
|
84 |
+
| 补全项目说明书 | FALSE | | |
|
85 |
+
| 更新readme文档 | FALSE | | 待办 |
|
86 |
+
| 确认是否需要关防火墙 | FALSE | | |
|
87 |
+
| 使用企业邮箱 | FALSE | | |
|
88 |
+
| 处理手机验证问题 | FALSE | | |
|
89 |
+
| 分享达子的课程到飞书共享空间 | FALSE | | |
|
90 |
+
| 明天下午3点有会议 | TRUE | 会议 | 开会 |
|
91 |
+
| 确认会议时间 | FALSE | | 细节讨论 |
|
92 |
+
| 把代码上传到云上的自己的分支 | FALSE | | 讨论 |
|
93 |
+
| "大数据需求准入评审周会,时间:11:00 -12:30 参会人:所有 地点:2栋17楼-用户-配有飞书会议-优先5人以上预定(12)深圳新一个代2栋" | TRUE | 参会人、时间、地点、会议 | 开会 |
|
94 |
+
| [26条]斯煜[表情]Vince: Mox: 你的Mox Credit月結單已準備就緒。你可登入Mox應用程式查看。 | TRUE | 账单 | 缴费 |
|
95 |
+
| 飞书验证码、处理飞书邮箱问题 | FALSE | | 无效信息 |
|
96 |
+
| Python版本确认 | FALSE | | 无效信息 |
|
LLM/filter_message/prompt.txt
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## 人工标注数据
|
2 |
+
|
3 |
+
| content | 是否正负样本 | 关键词 | 类型 |
|
4 |
+
| --- | --- | --- | --- |
|
5 |
+
| 【建设银行】您账户8699于3月26日8时6分向微信支付-羊城通缴费支出人民币3.5元,可用余额5285.33元。 | FALSE | 余额 | 提醒 |
|
6 |
+
| 【建设银行】您账户8699于3月26日8时11分向微信支付-小霞包点(香雪店)支出人民币5.5元,可用余额5279.83元。 | FALSE | 余额 | 提醒 |
|
7 |
+
| [3条]LJK86: 您的号卡已在配送途中,物流单号SF3147624215612,点击:http://t.hn.189.cn/EZv6bazD ,查询物流进度。如已签收请在有效期内先实名激活。客服热线4008155555【中国电信】 | FALSE | | |
|
8 |
+
| AlisaGG陈G老师1010: 【招商银行】您本月办理分期还款可享限时2.3折优惠!打开掌上生活APP,搜“分期还款”立享折扣优惠!资格实时测评,拒收请回复R | FALSE | | |
|
9 |
+
| "王斯煜[表情]Vince 黑客松nv: @AlisaGG @LJK86 明天中午再对 WANG Siyu邀请你加入飞书视频会议<br>会议主题:FilterLLM ToDoGenLLM PR Merge讨论<br>会议时间:3月31日 (今天) 12:00 - 12:30 (GMT+8)<br>会议 ID:656 445 907<br>会议链接:https://..." | TRUE | 飞书视频会议 | 会议 |
|
10 |
+
| [6条]王斯煜[表情]Vince 黑客松nv: Your Mint 3-Month, Unlimited plan expires in 2 days. Log into your account at my.mintmobile.com or via our app to make a payment and keep your Mint se... | FALSE | | |
|
11 |
+
| [4条]斯煜[表情]Vince: 【美团月付】您2月账单805.57元待还,最后还款日为本月22号,查账或立即还款点 dpurl.cn/HXFUxEba | TRUE | 还款 | 待付(还)款 |
|
12 |
+
| [4条]斯煜[表情]Vince: 【中通快递】73547529665397放在丰巢的包裹请及时取件,如有问题致电15900077340、020-22504077。 | TRUE | 快递、取件 | 物流取件 |
|
13 |
+
| [6条]斯煜[表情]Vince: 尊敬的*斯煜,您在我行办理的1笔个人贷款需于2024年12月08日17:00前还款,当期还款金额本息合计999999.84元,请您留意尾号0455的账户可用余额是否充足,避免因贷款逾期影响个人征信。具体贷款信息可通过工行网上银行、手机银行或致电贷款经办行查询。【工商银行】 | TRUE | 还款 | 待付(还)款 |
|
14 |
+
| [6条]斯煜[表情]Vince: Your Mint 3-Month, Unlimited plan expires in 2 days. Log into your account at my.mintmobile.com or via our app to make a payment and keep your Mint se... | FALSE | | 提醒 |
|
15 |
+
| AlisaGG陈G老师1010: 【驿收发】您的邮政包裹已到凯丰花园2栋驿站,请23:00前凭5-5-6530来取,详询18320926368 | TRUE | | 物流取件 |
|
16 |
+
| [2条]AlisaGG: 【小象超市】您好,我是小象超市骑手,【美团智能外卖柜】您的外卖已送至新一代产业园2栋4号柜(面向马路侧),格口号:27,取件码:9310,存柜超过12小时将被清理,请及时取件。 | TRUE | 取件码 | 物流取件 |
|
17 |
+
| [2条]AlisaGG陈G老师1010: 【小象超市】您好,我是小象超市骑手,【美团智能外卖柜】您的外卖已送至新一代产业园2栋4号柜(面向马路侧),格口号:27,取件码:9310,存柜超过12小时将被清理,请及时取件。 | TRUE | 取件码 | 物流取件 |
|
18 |
+
| "[3条]AlisaGG陈G老师1010: 取件通知<br>取件码: 11724837<br>运单号: 464285154986072<br>取件地址: 深圳新一代产业园P2出入口内侧4号丰巢柜<br>配送人员: 18124519013<br>计费规则: 查看详情" | TRUE | 取件码、运单号 | 物流取件 |
|
19 |
+
| "[4条]AlisaGG陈G老师1010: 取件再次提醒<br>取件码:: 27696201<br>配送公司:: 申通快递<br>运单号:: 777293635831671<br>配送员手机:: 13392809673<br>取件地址:: 深圳新一代产业园P2出入口内侧4号丰巢柜" | TRUE | 取件码、运单号 | 物流取件 |
|
20 |
+
| [5条]李JK老师-1228: 【菜鸟驿站】请凭140-3-1019到菜鸟驿站取件,查询详情u.cainiao.com/53h4bSr7zrh | TRUE | 取件码、菜鸟驿站 | 物流取件 |
|
21 |
+
| "斯煜[表情]Vince: WANG Siyu邀请你加入飞书视频会议<br>会议主题:FilterLLM ToDoGenLLM PR Merge讨论<br>会议时间:3月31日 (今天) 12:00 - 12:30 (GMT+8)<br>会议 ID:656 445 907<br>会议链接:https://vc.feishu.cn/j/65644590 ..." | TRUE | 飞书视频会议 | 会议 |
|
22 |
+
| 刻刻: 【停机前提醒】尊敬的移动客户,您好!您的账户余额不足被限制使用。现提醒您需充值缴费至少32.49元,以确保您继续享受畅通的通信服务。诚邀您一键办理自动充服务,自动充值缴费更轻松:https://dx.10086.cn/7WyRLA 。 心级服务、让爱连接【中国移动】 | TRUE | 缴费、余额不足 | 待付(还)款 |
|
23 |
+
| AlisaGG: 【小象超市】您的商品已放置在门口,因有易碎等商品请尽快取回,如有疑问请联系15794935204 。祝您生活愉快! | FALSE | | 提醒 |
|
24 |
+
| 【美团月付】您4月账单8937.95元待还,最后还款日为本月8号,查账或立即还款点 dpurl.cn/kVln1cqa | TRUE | 还款 | 待付(还)款 |
|
25 |
+
| 【建设银行】您账户8699于3月24日12时22分向微信支付-扫二维码付款支出人民币15元,可用余额5433.72元。 | FALSE | 可用余额 | 提醒 |
|
26 |
+
| 【建设银行】您账户8699于3月24日12时11分向支付宝-天猫-深圳市升景科技有限公司支出人民币1889元,可用余额5465.6元。 | FALSE | 可用余额 | |
|
27 |
+
| 【建设银行】您账户8699于3月24日12时11分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额5448.72元。 | FALSE | 可用余额 | |
|
28 |
+
| 【驿收发】您的京东包裹已到惜福堂对面快递驿站,请21:00前凭2-0147来取,详询16675200600 | TRUE | 驿站、包裹、取件 | 物流取件 |
|
29 |
+
| 【中国农业银行】为您特别准备了一份专属月度账单,登掌银搜“月度账单”或戳 go.abchina.com/k/CS7 查收。拒收请回复R | FALSE | 拒收 | 提醒 |
|
30 |
+
| 刻刻: 【火山引擎】亲爱的用户,您有千万DeepSeek模型额度待领取哦!每邀请1位新用户注册和使用,最高可获得130元代金券,多邀多得上不封顶!点击下载海报,分享到更多渠道:szacq.cn/ewyeb/ 拒收请回复R | FALSE | 拒收 | 提醒 |
|
31 |
+
| M丶D: 【阿里云推广】尊敬的mandahuang,老友季钜惠!云服务器 99元/年,新购续费同享!戳>><url id="cvlp38lgsoaarffpph8g" type="url" status="parsed" title="阿里云权益中心" wc="3700">https://t.aliyun.com/TIFqsCZY </url> 立即抢购!拒收请回复R | FALSE | 拒收 | 提醒 |
|
32 |
+
| 【AlipayHK】於3月19日 08:15需要通過Apple 服務(Apple services)待付款支付HKD8.00 | TRUE | 账单、待付款 | 支付 |
|
33 |
+
| [2条]李JK老师-1228: 【百度】亲爱的开发者您好,文心智能体年底送福利,即日起至12月30日,创建并提交您的智能体,即有机会获得现金卡福利,最高500元,多档奖励,中奖机会大!登陆天数越多,创建有创意,中奖率越高!福利倒计时!点击参与!<url id="cvlp38lgsoaarffpph9g" type="url" status="failed" title="" wc="0">https://agents.baidu.com/activity/detail/13 </url> | FALSE | 送福利、有机会 | 无效信息 |
|
34 |
+
| [5条]LJK86: 【百度】亲爱的开发者您好,文心智能体年底送福利,即日起至12月30日,创建并提交您的智能体,即有机会获得现金卡福利,最高500元,多档奖励,中奖机会大!登陆天数越多,创建有创意,中奖率越高!福利倒计时!点击参与!<url id="cvlp38lgsoaarffpph9g" type="url" status="failed" title="" wc="0">https://agents.baidu.com/activity/detail/13 </url> | FALSE | 送福利、有机会 | 无效信息 |
|
35 |
+
| [3条]M丶D: 【韵达快递】亲434466097408983超10小时未取出,如需帮助或有问题请致电15900077340 、020-89725127 | TRUE | 取件 | 物流取件 |
|
36 |
+
| [2条]LJK86: 【智谱AI】亲爱的开发者您好,感谢您参与智谱开放平台满意度调研,您的智谱清言月卡奖励正在发放,请在链接中输入您问卷中填写的手机号查询礼品码,前往PC/APP智谱清言会员充值页,选择“礼品码兑换”。查询链接:<url id="cvlp38lgsoaarffppha0" type="url" status="parsed" title="Feishu - Log in" wc="339">https://zhipu-ai.feishu.cn/share/base/query/shr </url> ... | FALSE | 满意度调研 | 无效信息 |
|
37 |
+
| [2条]M丶D: 【中国电信】流量满满,温暖相伴,我们特别为你准备了预存领10GB的流量大礼,无论是与家人视频通话,还是朋友间分享趣事,都能畅通无阻,让爱不断线。马上戳 <url id="cvlp38lgsoaarffpphag" type="url" status="parsed" title="预存领流量" wc="3306">https://vipxjzl.mini189.cn/BG/ </url> 了解吧,具体以实际页面展示为准,如已办理请忽略,转发无效。拒收请回复R | FALSE | 拒收 | |
|
38 |
+
| [3条]王斯煜[表情]Vince 黑客松nv: 【中通快递】包裹已到深圳光明正大城商业街103号店,取件码3-5-1092。到店扫“取件二维码”,线上查询更方便!询19128399078 | TRUE | 快递、包裹、取件码 | 物流取件 |
|
39 |
+
| [2条]李JK老师-1228: 【51CTO学堂】《DeepSeek训练营》火爆来袭!0元报名,学习AI核心技术,解锁职业新可能 zt60.cn/CMPE9 拒收请回复R | FALSE | 拒收 | 无效信息 |
|
40 |
+
| [3条]刻刻: 60GB本地數據及5000分鐘本地通話30日組合成功開啟,餘額已被扣除,有效期至10/02/2025 23:59。請立即登入MyLink App bit.ly/MySimMyLink 進行增值,若賬戶餘額充足,60GB本地數據及5000分鐘本地通話30日組合將每30日以$38自動續期,並於餘額內自動... | FALSE | | 提醒 |
|
41 |
+
| [3条]李JK老师-1228: 【湖南通信】尊敬的用户:您订购的卡已配送暂未签收,为了更好的为您服务,烦请回复:未拿到卡且需要回复1,卡在站点未取件回复2,已取件回复3。如有疑问请拨打4008155555 | TRUE | 未签收 | 包裹物流 |
|
42 |
+
| [2条]李JK老师-1228: 【深势科技】Bohrium用户您好,您有共计余额20.00元的体验卡将于2024-12-24 23:59:59到期,请及时使用。 | FALSE | 共计余额 | 提醒 |
|
43 |
+
| [3条]李JK老师-1228: 【讯飞开放平台】到期预警!尊敬的会员用户:您的个人级乐享会员,将于2024-12-08 00:00:00正式到期,截至目前仅剩7天。如您要继续使用,自即日起,7天内完成会员体验问卷<url id="cvlp38lgsoaarffpphb0" type="url" status="parsed" title="乐享会员调查问卷" wc="1540">http://1024-2019.iflytek.com/h5/vip-ques?t=2024-12-15 </url> ,即可免费续约... | FALSE | 到期 | 提醒 |
|
44 |
+
| M丶D: 【中通快递】73547529665397放在丰巢的包裹请及时取件,如有问题致电15900077340、020-22504077。 | TRUE | 丰巢、取件 | 物流取件 |
|
45 |
+
| [33条]斯煜[表情]Vince: 【中通快递】包裹已到深圳光明正大城商业街103号店,取件码3-5-1092。到店扫“取件二维码”,线上查询更方便!询19128399078 | TRUE | | 物流取件 |
|
46 |
+
| 王斯煜[表情]Vince 黑客松nv: 【申通快递】快递尾号7117已在代收点1天未取,请尽快取件,详询13434525312,最新快递状态请访问t.sto.cn/18PBt4 | TRUE | | 物流取件 |
|
47 |
+
| [2条]王斯煜[表情]Vince 黑客松nv: 【美团月付】您2月账单805.57元待还,最后还款日为本月22号,查账或立即还款点 dpurl.cn/HXFUxEba | TRUE | 账单、还款 | 待付(还)款 |
|
48 |
+
| [32条]斯煜[表情]Vince: 【美团月付】您2月账单805.57元待还,最后还款日为本月22号,查账或立即还款点 dpurl.cn/HXFUxEba | TRUE | 账单、还款 | 待付(还)款 |
|
49 |
+
| [26条]斯煜[表情]Vince: Mox: 你的Mox Credit月結單已準備就緒。你可登入Mox應用程式查看。 | TRUE | 账单 | 待付(还)款 |
|
50 |
+
| 【建设银行】您账户8699于3月22日9时51分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额7414.6元。 | FALSE | 可用余额 | 提醒 |
|
51 |
+
| [3条]M丶D: 明天九点开会 | TRUE | 时间、开会 | 会议 |
|
52 |
+
| 【建设银行】您账户8699于3月22日9时42分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额7431.48元。 | FALSE | 可用余额 | 提醒 |
|
53 |
+
| 【建设银行】您账户8699于3月22日18时2分向微信支付-羊城通缴费支出人民币2元,可用余额7397.6元。 | FALSE | 可用余额 | |
|
54 |
+
| 【招商银行】您的风险评估已到期,为避免错过我行优质产品信息,请及时重评!点击 cmbt.cn/a/zhV 去评估。如已完成或销户请忽略 | FALSE | 风险评估 | 提醒 |
|
55 |
+
| 【建设银行】您账户8699于3月22日23时45分向微信支付-西苑出版社支出人民币180元,可用余额7215.6元。 | FALSE | 可用余额 | 提醒 |
|
56 |
+
| 【建设银行】您账户8699于3月22日22时47分向微信支付-羊城通缴费支出人民币2元,可用余额7395.6元。 | FALSE | 可用余额 | 提醒 |
|
57 |
+
| 【中通快递】73547529665397放在丰巢的包裹请及时取件,如有问题致电15900077340、020-22504077。 | TRUE | 快递、丰巢、取件 | 包裹物流 |
|
58 |
+
| 【建设银行】您账户8699于3月22日13时16分向微信支付-扫二维码付款支出人民币15元,可用余额7399.6元。 | FALSE | 可用余额 | |
|
59 |
+
| 【建设银行】您账户8699于3月21日8时11分向微信支付-小霞包点(香雪店)支出人民币5.5元,可用余额7566.14元。 | FALSE | 可用余额 | |
|
60 |
+
| 【建设银行】您账户8699于3月21日8时7分向微信支付-羊城通缴费支出人民币3.5元,可用余额7571.64元。 | FALSE | 可用余额 | |
|
61 |
+
| 【建设银行】您账户8699于3月21日20时5分向微信支付-羊城通缴费支出人民币1元,可用余额7519.38元。 | FALSE | 可用余额 | |
|
62 |
+
| 【中国农业银行】百万立减金限时抢,最高10元立减金等您拿,本月戳 go.abchina.com/k/CfK 直达,先到先得。拒收请回复R | FALSE | 拒收 | 提醒 |
|
63 |
+
| 【建设银行】您账户8699于3月21日15时28分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额7520.38元。 | FALSE | 可用余额 | 提醒 |
|
64 |
+
| 【建设银行】您账户8699于3月21日15时28分向支付宝-理财-天弘基金管理有限公司支出人民币16.88元,可用余额7537.26元。 | FALSE | 可用余额 | 提醒 |
|
65 |
+
| M丶D: 【话费账单】尊敬的150****9503客户,您02月01日- 02月28日共消费8.00元。主要消费项目包括: | FALSE | 共消费 | 提醒 |
|
66 |
+
| 菜鳥包裹HK00083544922到達香港仔置富道19號置富花園19座地下B舖,提件碼7-1-1002,3個工作日内取。 | TRUE | | 物流取件 |
|
67 |
+
| 您的集運單SF3148634434778因其他异常已拒收退回賣家,建議您聯系賣家處理,如有必要可申請退款 | FALSE | | 提醒 |
|
68 |
+
| 菜鳥包裹HK00083525678到達香港仔置富道19號置富花園19座地下B舖,提件碼7-1-1004,3個工作日内取。 | TRUE | | 物流取件 |
|
69 |
+
| [10条]王斯煜[表情]Vince 黑客松nv: 菜鳥包裹HK00083544922到達香港仔置富道19號置富花園19座地下B舖,提件碼7-1-1002,3個工作日内取。 | TRUE | | 物流取件 |
|
70 |
+
| 【建设银行】您账户8699于3月21日12时20分向微信支付-扫二维码付款支出人民币12元,可用余额7554.14元。 | FALSE | 可用余额 | |
|
71 |
+
| [2条]AlisaGG陈G老师1010: 预约一个会和你 15分钟。我也正经了[加油] | TRUE | | 会议 |
|
72 |
+
| 【建设银行】您账户8699于3月27日8时3分向微信支付-羊城通缴费支出人民币3.5元,可用余额5240.93元。 | FALSE | 可用余额 | 提醒 |
|
73 |
+
| 【建设银行】您账户8699于3月27日8时28分向微信支付-百度平台商家支出人民币18.95元,可用余额5221.98元。 | FALSE | 可用余额 | 提醒 |
|
74 |
+
| 【建设银行】您账户8699于3月27日9时12分向支付宝-理财-天弘基金管理有限公司支出人民币16.88元,可用余额5205.1元。 | FALSE | 可用余额 | 提醒 |
|
75 |
+
| 【广州银行】您尾号5138的卡片转入人民币550.00元,本期账单已还清。绑定广州银行信用卡官微实时查账。 | FALSE | 账单、还清 | 提醒 |
|
76 |
+
| 【建设银行】您账户8699于3月27日9时26分向支付宝-黄敏达还款支出人民币550元,可用余额4655.1元。 | FALSE | 可用余额 | 提醒 |
|
77 |
+
| 【建设银行】您账户8699于3月27日9时32分向支付宝-广州市自来水有限公司支出人民币17.58元,可用余额4637.52元。 | FALSE | 可用余额 | 提醒 |
|
78 |
+
| 【建设银行】您账户8699于3月27日9时32分向支付宝-天弘基金管理有限公司支出人民币16.88元,可用余额4620.64元。 | FALSE | 可用余额 | 提醒 |
|
79 |
+
| 【建设银行】您账户8699于3月27日9时48分向微信支付-壹伴助手支出人民币399元,可用余额4221.64元。 | FALSE | 可用余额 | 提醒 |
|
80 |
+
| 【广州银行】黄敏达先生,您好。您的信用卡欠款未缴,请即日内还款,否则我行不排除根据客户协议委托第三方公司向您催缴。若已还款无需理会 | TRUE | 信用卡、还款 | 待付(还)款 |
|
81 |
+
| 【中国农业银行】尊敬的用户,您已超过1个月未登录账户,邀您点击 go.abchina.com/k/C7C 查看账户详情。拒收请回复R | FALSE | 拒收 | 提醒 |
|
82 |
+
| 【建设银行】您账户8699于3月27日12时38分向微信支付-兰州拉面(佳大)支出人民币16元,可用余额4205.64元。 | FALSE | 可用余额 | 提醒 |
|
83 |
+
| [23条]M丶D: 1000本链接: <url id="cvlp38lgsoaarffpphbg" type="url" status="failed" title="" wc="0">https://pan.baidu.com/s/13dV3m54iGE8oWgYtkARQPw?pwd=vwq3 </url> 提取码: vwq3 复制这段内容后打开百度网盘手机App,操作更方便哦 | FALSE | 百度网盘 | 提醒 |
|
84 |
+
| 【建设银行】您账户8699于3月27日9时32分向支付宝-广州市自来水有限公司支出人民币17.58元,可用余额4637.52元。 | FALSE | 可用余额 | 提醒 |
|
85 |
+
| 招商银行风险评估到期提醒 | FALSE | 到期提醒 | 提醒 |
|
86 |
+
| 【建设银行】您账户8699于3月27日9时32分向支付宝-广州市自来水有限公司支出人民币17.58元,可用余额4637.52元。 | FALSE | 可用余额 | 提醒 |
|
87 |
+
| 明天下午3点有会议 | TRUE | | 会议 |
|
88 |
+
| 确认会议时间 | FALSE | | 无效信息 |
|
89 |
+
| 把代码上传到云上的自己的分支 | FALSE | | 无效信息 |
|
90 |
+
| "大数据需求准入评审周会,时间:11:00 -12:30 参会人:所有 地点:2栋17楼-用户-配有飞书会议-优先5人以上预定(12)深圳新一个代2栋" | TRUE | 飞书会议 | 会议 |
|
LLM/filter_message/requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
openai
|
2 |
+
pyyaml
|
3 |
+
pymysql
|
4 |
+
black
|
LLM/orchestrator.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import yaml
|
3 |
+
import threading
|
4 |
+
import importlib
|
5 |
+
from pathlib import Path
|
6 |
+
import sys
|
7 |
+
# NOTE: yaml is imported above and is still required by load_config() below.
|
8 |
+
import mysql.connector as pymysql # Use mysql-connector-python alias
|
9 |
+
|
10 |
+
# Import config loader from todogen_LLM
|
11 |
+
from todogen_LLM.config_loader import get_mysql_config
|
12 |
+
|
13 |
+
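# NOTE: get_db_conn() takes its MySQL settings from get_mysql_config(); the local load_config() and the CONFIG/MYSQL_CONFIG globals below are still defined and evaluated at import time.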
|
14 |
+
# # 配置
|
15 |
+
def load_config():
    """Parse ``todogen_LLM/todogen_LLM_config.yaml`` and return its contents.

    The file is located relative to this module's own directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.
    """
    config_file = Path(__file__).parent.joinpath("todogen_LLM", "todogen_LLM_config.yaml")
    with config_file.open('r', encoding='utf-8') as handle:
        parsed = yaml.safe_load(handle)
    return parsed
|
19 |
+
|
20 |
+
CONFIG = load_config()
|
21 |
+
MYSQL_CONFIG = CONFIG['mysql']
|
22 |
+
|
23 |
+
# 数据库
|
24 |
+
def get_db_conn():
    """Open a new MySQL connection using the settings from get_mysql_config().

    Returns:
        The connection object returned by ``pymysql.connect`` (in this file
        ``pymysql`` is an alias for ``mysql.connector``).

    The connection is opened with TLS enabled (``ssl_ca`` is a CA file path),
    the ``utf8mb4`` charset and autocommit switched on.
    """
    settings = get_mysql_config()
    connect_kwargs = {
        'host': settings['host'],
        'port': settings.get('port', 3306),  # default used when no port is configured
        'user': settings['user'],
        'password': settings['password'],
        'database': settings['database'],
        'ssl_ca': settings['ssl_ca'],  # path
        'ssl_disabled': False,  # enabled
        'charset': 'utf8mb4',
        'autocommit': True,
    }
    return pymysql.connect(**connect_kwargs)
|
37 |
+
|
38 |
+
def get_latest_update_time(conn):
    """Return the largest ``date`` value currently stored in the Messages table.

    Args:
        conn: Open database connection whose ``cursor()`` result can be used
            as a context manager.

    Returns:
        The first column of the single row produced by
        ``SELECT MAX(date) FROM Messages``.
    """
    query = "SELECT MAX(date) FROM Messages"
    with conn.cursor() as cur:
        cur.execute(query)
        row = cur.fetchone()
        latest = row[0]
    return latest
|
43 |
+
|
44 |
+
#filter_llm主入口
|
45 |
+
def run_filter_llm():
    """Import the filter stage's ``main`` module and call its ``main()``.

    The stage directory is added to ``sys.path`` so the module can be
    imported by its bare name ``main``.

    Raises:
        RuntimeError: if the imported module has no ``main`` attribute.
    """
    base_dir = Path(__file__).parent
    # This repository ships the filter stage as ``filter_message``; the
    # original ``filter_llm`` name is tried first so a checkout that has it
    # behaves exactly as before.
    candidates = [base_dir / 'filter_llm', base_dir / 'filter_message']
    stage_dir = next((d for d in candidates if d.is_dir()), candidates[0])
    stage_path = str(stage_dir)
    # This function runs on every detected update; without the guard
    # sys.path would gain one duplicate entry per run.
    if stage_path not in sys.path:
        sys.path.append(stage_path)
    main = importlib.import_module('main')
    if not hasattr(main, 'main'):
        raise RuntimeError('filter_llm.main.py未找到main函数')
    main.main()
|
52 |
+
|
53 |
+
#todogen_LLM主入口
|
54 |
+
def run_todogen_llm():
    """Import ``todogen_llm`` from the ``todogen_LLM`` directory and run it.

    Prefers the module's ``main()``.  When there is no ``main`` it falls back
    to ``process_data(load_formatted_data())`` if both functions exist.

    Raises:
        RuntimeError: if neither entry point is available on the module.
    """
    stage_path = str(Path(__file__).parent / 'todogen_LLM')
    # This function runs on every detected update; without the guard
    # sys.path would gain one duplicate entry per run.
    if stage_path not in sys.path:
        sys.path.append(stage_path)
    todogen = importlib.import_module('todogen_llm')
    if hasattr(todogen, 'main'):
        todogen.main()
        return
    if hasattr(todogen, 'load_formatted_data') and hasattr(todogen, 'process_data'):
        data = todogen.load_formatted_data()
        todogen.process_data(data)
        return
    raise RuntimeError('todogen_llm.py未找到main或核心处理函数')
|
65 |
+
|
66 |
+
# 调用Notify主入口
|
67 |
+
#// def run_notify():
|
68 |
+
#// sys.path.append(str(Path(__file__).parent / 'Notify')) # Path needs update if kept
|
69 |
+
#// notify = importlib.import_module('notifyMain')
|
70 |
+
#// if hasattr(notify, 'main'):
|
71 |
+
#// notify.main()
|
72 |
+
#// else:
|
73 |
+
#// raise RuntimeError('Notify/notifyMain.py未找到main函数')
|
74 |
+
|
75 |
+
# 监听messages表并联动
|
76 |
+
def _close_quietly(conn):
    """Close *conn* on a best-effort basis, reporting (not raising) any failure."""
    try:
        conn.close()
    except Exception as e:
        # A connection that is already broken may fail to close; that must
        # not mask the error (or interrupt) that led here.
        print(f"监听或执行过程中发生错误: {e}")


def _replace_connection(conn):
    """Return a freshly opened connection, or *conn* itself if reconnecting fails."""
    try:
        fresh = get_db_conn()
    except Exception as e:
        print(f"监听或执行过程中发生错误: {e}")
        return conn
    _close_quietly(conn)
    return fresh


def monitor_and_orchestrate(interval=5):
    """Poll the Messages table and run the pipeline whenever it changes.

    Every ``interval`` seconds the latest ``date`` in Messages is compared
    with the last value seen.  When it differs, ``run_filter_llm()`` and then
    ``run_todogen_llm()`` are executed; the remembered value is only advanced
    after both succeed, so a failed run is retried on the next poll.

    Args:
        interval: Seconds to sleep between polls.

    The loop runs until the process is interrupted.  Errors raised while
    polling or while running the pipeline are printed and the loop continues.
    After a failed poll the database connection is re-opened, so one dropped
    connection does not make every later poll fail.  The connection is closed
    when the function exits.
    """
    conn = get_db_conn()
    try:
        last_update = get_latest_update_time(conn)
        print(f"初始messages表更新时间: {last_update}")
        while True:
            time.sleep(interval)
            try:
                current_update = get_latest_update_time(conn)
            except Exception as e:
                print(f"监听或执行过程中发生错误: {e}")
                conn = _replace_connection(conn)
                continue
            if current_update == last_update:
                print("无更新,继续监听...")
                continue
            print(f"检测到messages表有更新: {current_update}, 开始联动执行...")
            try:
                run_filter_llm()
                run_todogen_llm()
            except Exception as e:
                print(f"监听或执行过程中发生错误: {e}")
                continue
            last_update = current_update
    finally:
        _close_quietly(conn)
|
93 |
+
|
94 |
+
if __name__ == "__main__":
|
95 |
+
monitor_and_orchestrate(interval=5)
|
LLM/requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
black
|
2 |
+
python-dateutil
|
3 |
+
mysql-connector
|
4 |
+
mysql-connector-python>=8.0.0
|
5 |
+
schedule>=1.2.0
|
6 |
+
pyyaml>=6.0.1
|
7 |
+
python-dotenv
|
8 |
+
pymysql
|
9 |
+
PyYAML
|
10 |
+
openai
|
11 |
+
tqdm
|
12 |
+
|
LLM/todogen_LLM/FalsePositive_few_shot.txt
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# False Positive Few-Shot Examples
|
2 |
+
# (Non-actionable messages or pure notifications)
|
3 |
+
# 请用实际的、有代表性的例子替换以下内容
|
4 |
+
|
5 |
+
## Example 1 (纯粹通知)
|
6 |
+
Input Message:
|
7 |
+
{"123456789":"[通知]您的账户安全设置已更新。"}
|
8 |
+
|
9 |
+
Expected Output JSON:
|
10 |
+
```json
|
11 |
+
{"123456789":{"is_todo": false, "end_time":"null","location":"null","todo_content":"账号安全更新", "urgency": "unimportant"}}
|
12 |
+
```
|
13 |
+
|
14 |
+
## Example 2 (垃圾/广告信息)
|
15 |
+
Input Message:
|
16 |
+
{"987654321":"【优惠促销】限时抢购!全场商品低至一折,点击链接查看详情:xxx.com"}
|
17 |
+
|
18 |
+
Expected Output JSON:
|
19 |
+
```json
|
20 |
+
{"987654321":{"is_todo": false, "end_time":"null","location":"null","todo_content":"广告推销", "urgency": "unimportant"}}
|
21 |
+
```
|
22 |
+
|
23 |
+
## Example 3 (已完成/过期信息)
|
24 |
+
Input Message:
|
25 |
+
{"112233445":"[提醒]您昨天预约的会议已结束。"}
|
26 |
+
|
27 |
+
Expected Output JSON:
|
28 |
+
```json
|
29 |
+
{"112233445":{"is_todo": false, "end_time":"null","location":"null","todo_content":"过期内容", "urgency": "unimportant"}}
|
30 |
+
```
|
31 |
+
|
LLM/todogen_LLM/TruePositive_few_shot.txt
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# True Positive Few-Shot Examples
|
2 |
+
# (Actionable messages that should result in a to-do)
|
3 |
+
|
4 |
+
## Example 1
|
5 |
+
Input Message:
|
6 |
+
{"323231519":"[2条]李JK老师-1228: 【深势科技】Bohrium用户您好,您有共计余额20.00元的体验卡将于2024-12-24 23:59:59到期,请及时使用。"}
|
7 |
+
|
8 |
+
Expected Output JSON:
|
9 |
+
```json
|
10 |
+
{"323231519":{"is_todo": true, "end_time":"2024-12-24 23:59:59","location":"线上平台:【深势科技】Bohrium","todo_content":"请尽快使用20元体验卡", "urgency": "important"}}
|
11 |
+
```
|
12 |
+
|
13 |
+
## Example 2
|
14 |
+
Input Message:
|
15 |
+
{"331150112": "开始日期为2025-03-31T15:01:37,内容源于'ASAP Sample',[2条]AlisaGG: 【小象超市】您好,我是小象超市骑手,【美团智能外卖柜】您的外卖已送至新一代产业园2栋4号柜(面向马路侧),格口号:27,取件码:9310,存柜超过12小时将被清理,请及时取件。"}
|
16 |
+
|
17 |
+
Expected Output JSON:
|
18 |
+
```json
|
19 |
+
{"331150112": {"is_todo": true, "end_time": "2025-04-01T03:01:37", "location": "线下:新一代产业园2栋4号柜", "todo_content": "取快递(格口号:27, 取件码:9310)", "urgency": "urgent"}}
|
20 |
+
```
|
21 |
+
|
22 |
+
## Example 3
|
23 |
+
Input Message:
|
24 |
+
{"331150111": "开始日期为2025-03-31T15:01:36,内容源于'ASAP Sample',AlisaGG: 【驿收发】您的邮政包裹已到凯丰花园2栋驿站,请23:00前凭5-5-6530来取,详询18320926368"}
|
25 |
+
|
26 |
+
Expected Output JSON:
|
27 |
+
```json
|
28 |
+
{"331150111": {"is_todo": true, "end_time": "2025-03-31T23:00:00", "location": "线下:凯丰花园2栋驿站", "todo_content": "取快递(单号:5-5-6530)", "urgency": "important"}}
|
29 |
+
```
|
30 |
+
|
31 |
+
## Example 4
|
32 |
+
Input Message:
|
33 |
+
{"323231510":"[3条]王斯煜[表情]Vince 黑客松nv: 【中通快递】包裹已到深圳光明正大城商业街103号店,取件码3-5-1092。到店扫“取件二维码”,线上查询更方便!询19128399078"}
|
34 |
+
|
35 |
+
Expected Output JSON:
|
36 |
+
```json
|
37 |
+
{"323231510":{"is_todo": true, "end_time":"null","location":"线下地点:深圳光明正大城商业街103号店","todo_content":"快递取件,取件码为3-5-1092", "urgency": "important"}}
|
38 |
+
```
|
39 |
+
|
40 |
+
## Example 5
|
41 |
+
Input Message:
|
42 |
+
{"323231172":"[24条]斯煜[表情]Vince: 【高德打车】您有车费尚未支付,为不影响乘车信用,请前往高德地图app处理,或点击l.amap.com/2i0mBifpr 支付"}
|
43 |
+
|
44 |
+
Expected Output JSON:
|
45 |
+
```json
|
46 |
+
{"323231172":{"is_todo": true, "end_time":"null","location":"线上平台: 高德地图app","todo_content":"未支付车费,请点击l.amap.com/2i0mBifpr 支付", "urgency": "urgent"}}
|
47 |
+
```
|
48 |
+
|
49 |
+
## Example 6
|
50 |
+
Input Message:
|
51 |
+
{"405091409":"【圆通速递】快件尾号1014的包裹已送至(家门口)详询18038103314"}
|
52 |
+
|
53 |
+
Expected Output JSON:
|
54 |
+
```json
|
55 |
+
{"405091409":{"is_todo": true, "end_time":"null","location":"线下:家门口","todo_content":"取圆通快递(尾号1014)", "urgency": "important"}}
|
56 |
+
```
|
57 |
+
|
58 |
+
|
59 |
+
|
LLM/todogen_LLM/__pycache__/compare_data.cpython-312.pyc
ADDED
Binary file (9.54 kB). View file
|
|
LLM/todogen_LLM/__pycache__/config_loader.cpython-312.pyc
ADDED
Binary file (1.88 kB). View file
|
|
LLM/todogen_LLM/__pycache__/database_of_messages.cpython-312.pyc
ADDED
Binary file (11.5 kB). View file
|
|
LLM/todogen_LLM/__pycache__/export_todolist.cpython-312.pyc
ADDED
Binary file (3.36 kB). View file
|
|
LLM/todogen_LLM/__pycache__/filter_message_list.cpython-312.pyc
ADDED
Binary file (2.15 kB). View file
|
|
LLM/todogen_LLM/__pycache__/filter_useful_data_to_dict.cpython-312.pyc
ADDED
Binary file (8.05 kB). View file
|
|
LLM/todogen_LLM/__pycache__/logger_config.cpython-312.pyc
ADDED
Binary file (2.33 kB). View file
|
|
LLM/todogen_LLM/__pycache__/path_validator.cpython-312.pyc
ADDED
Binary file (814 Bytes). View file
|
|
LLM/todogen_LLM/__pycache__/receiving_useful_messages.cpython-312.pyc
ADDED
Binary file (2.98 kB). View file
|
|
LLM/todogen_LLM/__pycache__/todogen_llm.cpython-312.pyc
ADDED
Binary file (21.1 kB). View file
|
|
LLM/todogen_LLM/compare_data.py
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
import copy
|
5 |
+
from config_loader import get_paths
|
6 |
+
from datetime import datetime
|
7 |
+
|
8 |
+
from export_todolist import export_todolist_to_json
|
9 |
+
from receiving_useful_messages import main
|
10 |
+
|
11 |
+
# --- 配置与辅助函数 ---
|
12 |
+
sys.stdout.reconfigure(encoding='utf-8')
|
13 |
+
|
14 |
+
def convert_datetime(obj):
    """``default`` hook for ``json.dumps``: render datetimes as ISO-8601 text.

    Args:
        obj: The value the JSON encoder could not serialize on its own.

    Returns:
        ``obj.isoformat()`` when ``obj`` is a ``datetime``.

    Raises:
        TypeError: for any other type.
    """
    if not isinstance(obj, datetime):
        raise TypeError(f"Type {type(obj)} not serializable")
    return obj.isoformat()
|
19 |
+
|
20 |
+
def load_json_data(file_path):
    """Load a JSON file whose top-level value must be a list.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed list, or ``None`` when the file is missing, cannot be
        read or decoded, or holds something other than a list.  Each failure
        prints a message describing the cause.
    """
    if not os.path.exists(file_path):
        print(f"[错误] 文件未找到: {file_path}")
        return None

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        if not isinstance(payload, list):
            print(f"[错误] 文件格式不正确,预期为列表: {file_path}")
            return None
        return payload
    except json.JSONDecodeError:
        print(f"[错误] JSON解码失败: {file_path}")
        return None
    except Exception as e:
        print(f"[错误] 加载文件时发生未知错误 {file_path}: {str(e)}")
        return None
|
39 |
+
|
40 |
+
def generate_unique_id(base_id, existing_ids_set):
    """Derive an ID from ``base_id`` that is not yet in ``existing_ids_set``.

    The first candidate is ``"<base_id>_upd"``; if that is taken,
    ``"<base_id>_upd_1"``, ``"<base_id>_upd_2"``, ... are tried in order.

    Args:
        base_id: Value the new ID is derived from (converted with ``str``).
        existing_ids_set: Container of IDs already in use; it is only read.

    Returns:
        The first candidate string not found in ``existing_ids_set``.
    """
    stem = f"{str(base_id)}_upd"
    if stem not in existing_ids_set:
        return stem
    suffix = 1
    while f"{stem}_{suffix}" in existing_ids_set:
        suffix += 1
    return f"{stem}_{suffix}"
|
48 |
+
|
49 |
+
# --- 核心处理逻辑函数 ---
|
50 |
+
def process_record(item_r, existing_message_ids, existing_todo_contents, existing_message_id_to_record, all_known_message_ids, stats):
    """Decide whether one record from result1.json belongs in compare.json.

    Rules, keyed on the record's ``message_id`` and ``todo_content`` (both
    compared as strings):

    * new message_id, new todo_content      -> the record is returned and its
      ID is added to ``all_known_message_ids``;
    * new message_id, known todo_content    -> printed, nothing returned;
    * known message_id, different content   -> a deep copy of the record is
      returned with its ``message_id`` unchanged;
    * known message_id, same content        -> printed, nothing returned.

    Args:
        item_r: The record (dict) to evaluate.
        existing_message_ids: String IDs already present in the exported list.
        existing_todo_contents: String todo contents already present.
        existing_message_id_to_record: Maps a string ID to its existing record.
        all_known_message_ids: Set of every ID seen so far; updated in place.
        stats: Counter dict; the counter matching the outcome is incremented.

    Returns:
        The record to add to compare.json, or ``None`` when nothing is saved.
    """
    try:
        r_message_id_str = str(item_r['message_id'])
        r_todo_content_str = str(item_r['todo_content'])
    except KeyError as e:
        print(f"[警告] result1.json 中的记录缺少键 {e},已跳过: {item_r}")
        stats['skipped_missing_keys'] += 1
        return None
    except TypeError as e:
        # Raised when item_r cannot be indexed by key (for example None).
        print(f"[警告] result1.json 中的记录键值类型错误 {e},已跳过: {item_r}")
        stats['skipped_missing_keys'] += 1
        return None

    if r_message_id_str not in existing_message_ids:
        # --- Case 1: message_id not seen before ---
        if r_todo_content_str not in existing_todo_contents:
            # 1.1: content is new as well -> save the record as-is.
            all_known_message_ids.add(r_message_id_str)
            stats['saved_new_id_new_content'] += 1
            return item_r

        # 1.2: content already exists under another ID -> report only.
        print("-" * 30)
        print(f"打印 (新 message_id: {r_message_id_str}, 但 todo_content 已存在):")
        print(json.dumps(item_r, indent=2, ensure_ascii=False, default=convert_datetime))
        print("-" * 30)
        stats['printed_new_id_existing_content'] += 1
        return None

    # --- Case 2: message_id already exists ---
    record_e = existing_message_id_to_record.get(r_message_id_str)
    if record_e is None:
        # Should not happen when the set and the dict are built from the same
        # data; counted separately so it is visible in the statistics.
        print(f"[警告] ID {r_message_id_str} 在集合中但在字典中找不到?跳过。")
        stats['skipped_internal_error'] = stats.get('skipped_internal_error', 0) + 1
        return None

    e_todo_content_str = str(record_e.get('todo_content', ''))

    if r_todo_content_str != e_todo_content_str:
        # 2.1: same ID, different content -> save a copy of the record.
        # NOTE: rewriting the ID to a generated "<id>_upd" value is disabled
        # (those assignments were commented out in the original), so the copy
        # keeps its message_id.  The previously computed-and-discarded unique
        # ID is no longer generated.
        modified_item_r = copy.deepcopy(item_r)
        stats['saved_modified_id_diff_content'] += 1
        return modified_item_r

    # 2.2: same ID and same content -> report only.
    print("-" * 30)
    print(f"打印 (message_id: {r_message_id_str} 已存在, todo_content 相同):")
    print("来自 result1.json:")
    print(json.dumps(item_r, indent=2, ensure_ascii=False, default=convert_datetime))
    print("-" * 30)
    stats['printed_existing_id_same_content'] += 1
    return None
|
114 |
+
|
115 |
+
# --- 主函数 ---
|
116 |
+
def compare_and_generate_updates():
    """Export the todolist, regenerate result1.json, diff them and write compare.json.

    Returns:
        The list of records written to ``compare.json`` (possibly empty), or
        ``None`` when the export, the generation step or the JSON loading fails.
    """
    data_dir = get_paths()['data_dir']
    compare_output_file = os.path.join(data_dir, "compare.json")

    # Step 1: dump the current todolist table to JSON.
    extracted_list_path = export_todolist_to_json()
    if not (extracted_list_path and os.path.exists(extracted_list_path)):
        print("[错误] 导出 todolist 数据失败,流程终止。")
        return

    # Step 2: run the message-processing step that produces result1.json.
    result1_path = main()
    if not (result1_path and os.path.exists(result1_path)):
        print("[错误] 生成 result1.json 失败,流程终止。")
        return

    # Step 3: load both JSON documents.
    result1_data = load_json_data(result1_path)
    extracted_data = load_json_data(extracted_list_path)
    if result1_data is None or extracted_data is None:
        print("[错误] 数据加载失败,流程终止。")
        return

    # Lookup structures over the records that already exist.
    try:
        existing_message_id_to_record = {
            str(entry['message_id']): entry
            for entry in extracted_data
            if 'message_id' in entry
        }
        existing_message_ids = set(existing_message_id_to_record)
        existing_todo_contents = {
            str(entry['todo_content'])
            for entry in extracted_data
            if 'todo_content' in entry
        }
    except (KeyError, TypeError) as e:
        print(f"[错误] extracted_list.json 文件处理失败: {e}。请检查文件内容和格式。")
        return

    records_for_compare_json = []
    all_known_message_ids = set(existing_message_ids)
    # Counters; process_record updates them in place.
    stats = dict.fromkeys(
        (
            'processed',
            'skipped_missing_keys',
            'saved_new_id_new_content',
            'printed_new_id_existing_content',
            'saved_modified_id_diff_content',
            'printed_existing_id_same_content',
            'skipped_internal_error',
        ),
        0,
    )

    print("[信息] 开始比较和处理数据...")
    for item_r in result1_data:
        stats['processed'] += 1
        outcome = process_record(
            item_r,
            existing_message_ids,
            existing_todo_contents,
            existing_message_id_to_record,
            all_known_message_ids,
            stats,
        )
        if outcome is None:
            continue
        records_for_compare_json.append(outcome)

    # Persist the result; a write failure is reported but the list is still returned.
    try:
        with open(compare_output_file, 'w', encoding='utf-8') as f:
            json.dump(records_for_compare_json, f, indent=2, ensure_ascii=False, default=convert_datetime)
        print(f"✅ 成功生成 compare.json 文件,包含 {len(records_for_compare_json)} 条记录。")
    except Exception as e:
        print(f"[错误] 写入 compare.json 文件时发生错误: {str(e)}")

    # Summary report.
    summary = [
        "=" * 40,
        "处理统计:",
        f" 处理 result1.json 记录总数: {stats['processed']}",
        f" 跳过 (缺少关键键或类型错误): {stats['skipped_missing_keys']}",
    ]
    if stats['skipped_internal_error'] > 0:
        summary.append(f" 跳过 (内部逻辑错误): {stats['skipped_internal_error']}")
    summary += [
        "-" * 20,
        " 写入 compare.json:",
        f" - 新 message_id, 新 todo_content: {stats['saved_new_id_new_content']}",
        f" - 修改后 message_id (因冲突且 todo_content 不同): {stats['saved_modified_id_diff_content']}",
        "-" * 20,
        " 打印到控制台:",
        f" - 新 message_id, 但 todo_content 已存在: {stats['printed_new_id_existing_content']}",
        f" - message_id 已存在, todo_content 相同: {stats['printed_existing_id_same_content']}",
        "=" * 40,
    ]
    for line in summary:
        print(line)

    return records_for_compare_json
|
207 |
+
|
208 |
+
if __name__ == "__main__":
    # Script entry point: run the whole export -> generate -> compare pipeline.
    compare_and_generate_updates()
|
LLM/todogen_LLM/config_loader.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# config_loader.py
|
2 |
+
import yaml
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
def load_config():
    """Parse ``todogen_LLM_config.yaml`` located next to this module and return the result."""
    yaml_file = Path(__file__).parent / "todogen_LLM_config.yaml"
    with open(yaml_file, mode='r', encoding='utf-8') as handle:
        parsed = yaml.safe_load(handle)
    return parsed


# Loaded once at import time; the accessor functions below read from this object.
CONFIG = load_config()
|
11 |
+
|
12 |
+
def get_mysql_config():
    """Return a copy of the ``mysql`` config section with ``ssl_ca`` made module-relative.

    The ``ssl_ca`` entry is joined onto this file's directory and converted to ``str``.
    """
    mysql_section = CONFIG['mysql']
    settings = dict(mysql_section)
    settings['ssl_ca'] = str(Path(__file__).parent / mysql_section['ssl_ca'])
    return settings
|
17 |
+
|
18 |
+
def get_openai_config():
    """Return the ``openai`` section of the loaded configuration."""
    openai_section = CONFIG['openai']
    return openai_section
|
20 |
+
|
21 |
+
def get_paths():
    """Return the project directories as absolute ``Path`` objects.

    Uses the module-level ``CONFIG`` (like the other accessors in this module)
    instead of re-reading and re-parsing the YAML file on every call.

    Returns:
        dict with keys ``base_dir`` (directory containing this module),
        ``data_dir`` and ``logging_dir`` (both resolved relative to ``base_dir``
        from the ``paths`` section of the configuration).

    Raises:
        KeyError: if the ``paths`` section or one of its entries is missing.
    """
    base = Path(__file__).resolve().parent  # the todogen_LLM directory
    path_settings = CONFIG['paths']
    return {
        'base_dir': base,
        'data_dir': base / path_settings['data_dir'],
        'logging_dir': base / path_settings['logging_dir'],
    }
|
29 |
+
|
30 |
+
def get_processing_config():
    """Return the ``processing`` section of the loaded configuration."""
    processing_section = CONFIG['processing']
    return processing_section
|
32 |
+
|
33 |
+
def get_defaults_config():
    """Return the ``defaults`` section of the loaded configuration."""
    defaults_section = CONFIG['defaults']
    return defaults_section
|
LLM/todogen_LLM/database_of_messages.py
ADDED
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# database_of_messages.py
|
2 |
+
from config_loader import get_paths
|
3 |
+
import mysql.connector
|
4 |
+
from datetime import datetime
|
5 |
+
from pathlib import Path
|
6 |
+
import sys
|
7 |
+
from config_loader import get_mysql_config, get_defaults_config, get_processing_config
|
8 |
+
import concurrent.futures # 必须添加的模块导入
|
9 |
+
from concurrent.futures import ThreadPoolExecutor # 关键修复导入
|
10 |
+
from tqdm import tqdm
|
11 |
+
import logging # 导入 logging
|
12 |
+
|
13 |
+
# 获取 logger 实例
|
14 |
+
logger = logging.getLogger(__name__)
|
15 |
+
|
16 |
+
sys.stdout.reconfigure(encoding='utf-8')
|
17 |
+
|
18 |
+
def process_row(args):
    """Convert one database row into a ``(message_id, row_dict)`` pair.

    Args:
        args: a ``(columns, row)`` tuple - the column names and the matching values.

    Returns:
        Tuple of the stringified ``message_id`` (``""`` when the column is absent)
        and a dict mapping each column name to a string: ``datetime`` values are
        rendered with ``isoformat()``, everything else with ``str()``.
    """
    columns, row = args

    def _as_text(value):
        # datetimes get ISO-8601 text; every other value (ints, None, ...) goes through str().
        return value.isoformat() if isinstance(value, datetime) else str(value)

    row_dict = {col_name: _as_text(value) for col_name, value in zip(columns, row)}
    return (str(row_dict.get("message_id", "")), row_dict)
|
31 |
+
|
32 |
+
def async_main() -> dict:
    """Read every row of the ``Messages`` table and return them keyed by message_id.

    Despite its name this function is synchronous. Rows are converted with
    ``process_row`` on a thread pool while a tqdm progress bar is updated.

    Returns:
        dict mapping the stringified ``message_id`` to the row dict produced by
        ``process_row``; an empty dict when a database or unexpected error occurs.
    """
    paths = get_paths()
    # parents=True (as the other modules do) so a missing parent directory
    # does not abort the run before the database is even queried.
    paths['data_dir'].mkdir(parents=True, exist_ok=True)

    logger.info("开始执行 async_main 获取所有消息...")
    db_config = get_mysql_config()
    processing_config = get_processing_config()
    db_fetch_workers = processing_config.get('db_fetch_workers', 4)  # pool size, default 4

    conn = None
    cursor = None
    try:
        logger.debug("尝试连接数据库 (async_main)...")
        conn = mysql.connector.connect(
            user=db_config['user'],
            password=db_config['password'],
            host=db_config['host'],
            port=db_config['port'],
            database=db_config['database'],
            ssl_ca=db_config['ssl_ca'],
            ssl_disabled=False
        )
        logger.info("✅ 数据库连接成功 (async_main)")
        cursor = conn.cursor()
        logger.info("开始执行查询: SELECT * FROM Messages")
        cursor.execute("SELECT * FROM Messages")
        result = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]
        logger.info(f"数据库查询完成,获取到 {len(result)} 条原始记录。")

        data = {}
        with ThreadPoolExecutor(max_workers=db_fetch_workers) as executor, \
                tqdm(total=len(result), desc="数据获取进度") as pbar:

            futures = [executor.submit(process_row, (columns, row)) for row in result]

            processed_count = 0
            for future in concurrent.futures.as_completed(futures):
                try:
                    message_id, row_dict = future.result()
                    data[message_id] = row_dict
                    processed_count += 1
                except Exception as exc:
                    # A single bad row is logged and skipped; the rest still load.
                    logger.error(f"处理单行数据时出错: {exc}", exc_info=True)
                pbar.update(1)
        logger.info(f"数据行处理完成,成功处理 {processed_count}/{len(result)} 行。")
        return data

    except mysql.connector.Error as e:
        logger.error(f"❌ 数据库操作错误 (async_main): {e}", exc_info=True)
        return {}
    except Exception as e:
        # logger.exception records the traceback automatically.
        logger.exception(f"❌ async_main 中发生未知错误: {e}")
        return {}
    finally:
        if cursor:
            cursor.close()
        if conn and conn.is_connected():
            conn.close()
            logger.info("ℹ️ 数据库连接已关闭 (async_main)。")
|
100 |
+
|
101 |
+
"""
|
102 |
+
后面部分即为上传数据部分,切不可搞错
|
103 |
+
|
104 |
+
"""
|
105 |
+
def upload_to_todolist(data: dict):
    """Bulk-insert converted todo items into the ``todolist`` table via ``executemany``.

    Each value of ``data`` is expected to be a dict; the fields read are
    ``user_id`` (defaults to 0), ``date`` (required, ISO text), ``todo_content``
    (required), ``end_time`` (optional ISO text), ``location`` (optional,
    truncated to 255 characters), ``todo_statu`` and ``urgency_statu`` (fall back
    to the values from the ``defaults`` configuration section).

    Items that cannot be converted are logged and skipped. Compared with the
    previous version, a ``location`` of ``None`` is now treated as an empty
    string instead of discarding the record, and a non-dict item is skipped
    instead of aborting the whole upload.

    Args:
        data: mapping of an item key (used only for logging) to the item dict.

    Returns:
        None. Database errors are logged and the transaction is rolled back.
    """
    logger.info(f"开始执行 upload_to_todolist,准备处理 {len(data)} 条输入数据...")
    db_config = get_mysql_config()

    rows_to_insert = []
    skipped_count = 0

    defaults_config = get_defaults_config()
    default_todo_status = defaults_config.get('todo_status', 'doing')
    default_urgency_status = defaults_config.get('urgency_status', 'unimportant')

    logger.debug("开始遍历和转换数据用于批量插入...")
    for item_key, item in data.items():
        try:
            # NOTE(review): user_id silently falls back to 0 when absent - confirm this is intended.
            user_id = int(item.get("user_id", 0))

            date_str = item.get("date")
            if not date_str:
                raise ValueError("缺少 'date' 字段")
            start_time = datetime.fromisoformat(date_str.replace("T", " "))

            todo_content = item["todo_content"]

            end_time_str = item.get("end_time")
            end_time = datetime.fromisoformat(end_time_str) if end_time_str else None

            # `or ""` covers an explicit None value, which would otherwise fail on the slice.
            location = (item.get("location") or "")[:255]

            todo_status = item.get("todo_statu", default_todo_status)
            urgency_status = item.get("urgency_statu", default_urgency_status)

            # Tuple order must match the column list of the INSERT statement below.
            rows_to_insert.append((
                user_id,
                start_time,
                end_time,
                location,
                todo_content,
                todo_status,
                urgency_status
            ))

        except (KeyError, ValueError, TypeError, AttributeError) as e:
            skipped_count += 1
            logger.warning(f"⚠️ 跳过无效数据 (来自 key {item_key}, 原因: {str(e)}) - 原始数据: {item}")
            continue

    logger.debug("数据转换完成。")

    if skipped_count > 0:
        logger.info(f"ℹ️ 共跳过 {skipped_count} 条无效数据。")

    if not rows_to_insert:
        logger.info("ℹ️ 没有有效数据需要插入到数据库。")
        return

    logger.info(f"准备将 {len(rows_to_insert)} 条有效数据批量插入数据库...")
    cnx = None
    cursor = None
    try:
        logger.debug("尝试连接数据库 (upload_to_todolist)...")
        cnx = mysql.connector.connect(**db_config)
        logger.info("✅ 数据库连接成功 (upload_to_todolist)")
        cursor = cnx.cursor()

        insert_query = """
        INSERT INTO todolist (
            user_id,
            start_time,
            end_time,
            location,
            todo_content,
            todo_statu,
            urgency_statu
        ) VALUES (%s, %s, %s, %s, %s, %s, %s)
        """

        logger.info("开始执行批量插入 (executemany)...")
        cursor.executemany(insert_query, rows_to_insert)
        rowcount = cursor.rowcount
        logger.info("批量插入执行完毕,尝试提交事务...")
        cnx.commit()
        logger.info(f"✅ 成功批量插入 {rowcount} 条记录到 todolist 表")

    except mysql.connector.Error as err:
        logger.error(f"❌ 数据库批量插入错误: {err}", exc_info=True)
        logger.warning("尝试回滚数据库事务...")
        try:
            if cnx and cnx.is_connected():
                cnx.rollback()
                logger.warning("数据库事务已回滚。")
        except Exception as rollback_err:
            logger.error(f"尝试回滚事务时出错: {rollback_err}", exc_info=True)
    except Exception as e:
        logger.exception(f"❌ upload_to_todolist 中发生未知错误: {e}")
    finally:
        if cursor:
            cursor.close()
        if cnx and cnx.is_connected():
            cnx.close()
            logger.info("ℹ️ 数据库连接已关闭 (upload_to_todolist)。")
|
LLM/todogen_LLM/export_todolist.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# export_todolist.py
|
2 |
+
import json
|
3 |
+
from pathlib import Path
|
4 |
+
import mysql.connector
|
5 |
+
from config_loader import get_mysql_config, get_paths
|
6 |
+
from datetime import datetime # 新增导入
|
7 |
+
import sys
|
8 |
+
|
9 |
+
sys.stdout.reconfigure(encoding='utf-8')
|
10 |
+
|
11 |
+
def convert_datetime(obj):
    """``json.dump(default=...)`` hook that serialises date/time values as ISO-8601 text.

    Handles ``datetime`` as before and additionally plain ``date`` and ``time``
    objects, so such values no longer abort the export with a ``TypeError``.

    Args:
        obj: the value the JSON encoder could not serialise on its own.

    Returns:
        ``obj.isoformat()`` for ``datetime``, ``date`` and ``time`` instances.

    Raises:
        TypeError: for any other type (the contract ``json`` expects from ``default``).
    """
    # Local import: the module itself only imports the ``datetime`` class.
    from datetime import date, time

    # ``datetime`` is a subclass of ``date``, so it is covered by this check too.
    if isinstance(obj, (date, time)):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")
|
16 |
+
|
17 |
+
def export_todolist_to_json():
    """Dump every row of the ``todolist`` table to ``todolist_export.json``.

    The file is written into the configured ``data_dir`` (created if needed).

    Returns:
        The output file path as ``str`` on success, ``None`` when a database
        error or any other exception occurs (the error is printed).
    """
    # Initialised up front so the ``finally`` clause never touches an unbound name.
    conn = None
    cursor = None
    try:
        db_config = get_mysql_config()
        paths = get_paths()

        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor(dictionary=True)

        cursor.execute("SELECT * FROM todolist")
        results = cursor.fetchall()

        output_dir = Path(paths['data_dir'])
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / "todolist_export.json"
        with open(output_path, 'w', encoding='utf-8') as f:
            # convert_datetime turns date/time column values into ISO strings.
            json.dump(results, f,
                      indent=2,
                      ensure_ascii=False,
                      default=convert_datetime)

        print(f"✅ 成功导出 {len(results)} 条记录到 {output_path}")

        return str(output_path)

    except mysql.connector.Error as err:
        print(f"[错误] 数据库错误: {err}")
        return None
    except Exception as e:
        print(f"[错误] 发生异常: {str(e)}")
        return None
    finally:
        if conn is not None and conn.is_connected():
            if cursor is not None:
                cursor.close()
            conn.close()
|
57 |
+
|
58 |
+
if __name__ == "__main__":
    # Script entry point: export the todolist table to JSON.
    export_todolist_to_json()
|
LLM/todogen_LLM/filter_message_list.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import mysql.connector
|
2 |
+
from pathlib import Path
|
3 |
+
import sys
|
4 |
+
from config_loader import get_mysql_config
|
5 |
+
|
6 |
+
sys.stdout.reconfigure(encoding='utf-8')
|
7 |
+
|
8 |
+
def get_message_ids():
    """Return every ``message_id`` stored in the ``filter_message`` table.

    Returns:
        list of the first column of each row. An empty list is returned when a
        database error or other exception occurs (previously that path raised
        ``UnboundLocalError`` because the result variable was never assigned).
    """
    config = get_mysql_config()

    message_ids = []
    cnx = None
    cursor = None
    try:
        cnx = mysql.connector.connect(
            user=config['user'],
            password=config['password'],
            host=config['host'],
            port=config['port'],
            database=config['database'],
            ssl_ca=config['ssl_ca'],
            ssl_disabled=False
        )
        cursor = cnx.cursor()

        cursor.execute("SELECT message_id FROM filter_message")
        message_ids = [row[0] for row in cursor.fetchall()]
        print(f"成功获取 {len(message_ids)} 条message_id")

    except mysql.connector.Error as err:
        print(f"数据库错误: {err}")
    except Exception as e:
        print(f"发生异常: {str(e)}")
    finally:
        # Release the connection on the error paths as well.
        if cnx is not None and cnx.is_connected():
            if cursor is not None:
                cursor.close()
            cnx.close()

    return message_ids
|
45 |
+
|
46 |
+
if __name__ == '__main__':
    # Script entry point: fetch the ids and show them on stdout.
    id_list = get_message_ids()
    print("\n提取结果:")
    print(id_list)
|
LLM/todogen_LLM/filter_useful_data_to_dict.py
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# filter_useful_data_to_dict.py
|
2 |
+
from database_of_messages import async_main # 确保导入的是同步函数
|
3 |
+
from typing import List, Dict
|
4 |
+
import re
|
5 |
+
import json
|
6 |
+
import datetime
|
7 |
+
from filter_message_list import get_message_ids
|
8 |
+
from pathlib import Path # 确保导入 Path
|
9 |
+
import logging
|
10 |
+
from config_loader import get_paths # 确保导入 get_paths
|
11 |
+
|
12 |
+
# --- 在顶层获取 logger 实例 ---
|
13 |
+
logger = logging.getLogger(__name__)
|
14 |
+
|
15 |
+
# 移除所有异步装饰器和await调用
|
16 |
+
def fetch_target_messages(target_ids: List[int]) -> Dict[str, Dict]:
    """Load all messages through ``async_main`` and keep only the requested ids.

    Args:
        target_ids: ids to keep; they are compared as strings against the keys
            returned by ``async_main``.

    Returns:
        dict of the matching entries, keyed exactly as ``async_main`` keys them.
    """
    logger.info("🛜 正在获取目标消息原始数据...")

    every_message = async_main()

    wanted = set(map(str, target_ids))
    selected = {}
    for key, row in every_message.items():
        if key in wanted:
            selected[key] = row

    logger.info(f"✅ 找到 {len(selected)}/{len(target_ids)} 条目标消息")
    return selected
|
28 |
+
|
29 |
+
def format_messages(raw_data: Dict[str, Dict]) -> Dict[str, str]:
    """Render each raw message as a single descriptive string.

    For every entry the ``date`` is normalised to ``%Y-%m-%dT%H:%M:%S`` (or
    ``"null"`` when it is a string that cannot be parsed), the ``sender`` has
    surrounding quote characters stripped, and the text omits the sender when
    the sender consists only of digits.

    Args:
        raw_data: mapping of message id to a dict with ``date``, ``sender`` and
            ``content`` entries (each defaults to ``"null"`` when absent).

    Returns:
        dict mapping the same ids to the formatted strings.
    """
    logger.info("🔄 正在进行数据格式化...")

    result = {}
    for msg_id in raw_data:
        details = raw_data[msg_id]

        stamp = details.get("date", "null")
        if isinstance(stamp, str):
            try:
                # Round-trip through datetime to validate and normalise the text.
                stamp = datetime.datetime.fromisoformat(stamp).strftime("%Y-%m-%dT%H:%M:%S")
            except ValueError:
                stamp = "null"

        origin = details.get("sender", "null").strip("'‘'")
        body = details.get("content", "null")

        # A purely numeric sender is left out of the text.
        result[msg_id] = (
            f"开始日期为{stamp},{body}"
            if origin.isdigit()
            else f"开始日期为{stamp},内容源于'{origin}',{body}"
        )

    logger.info("🎉 格式化完成")
    return result
|
59 |
+
|
60 |
+
def validate_format(formatted_data: Dict[str, str], target_ids: List[int]) -> bool:
    """Check that no formatted message names a purely numeric sender.

    Args:
        formatted_data: mapping of message id to formatted text.
        target_ids: accepted for interface compatibility; not used by the check.

    Returns:
        ``True`` when every entry passes.

    Raises:
        ValueError: on the first entry whose text contains a quoted, digits-only source.
    """
    numeric_source = r"内容源于'(\d+)'"
    for msg_id in formatted_data:
        text = formatted_data[msg_id]
        if "内容源于" not in text:
            continue
        if re.search(numeric_source, text):
            raise ValueError(f"❌ 值 {msg_id} 包含数字来源标识")
    return True
|
65 |
+
|
66 |
+
# --- 恢复 get_formatted_data 的标准保存逻辑 ---
|
67 |
+
# 移除 output_override_path 参数
|
68 |
+
def get_formatted_data(target_ids: List[int]) -> Dict[str, str]:
    """Fetch, format, validate and save the messages with the given ids.

    The formatted data is written to ``filter_data.json`` inside the configured
    ``data_dir``. Saving is best-effort: a validation failure or a write error
    is logged and the formatted data is still returned.

    The output path is now computed once from ``get_paths()``; the previous
    second computation produced the same path, and creating the directory is
    now part of the guarded save step.

    Args:
        target_ids: ids of the messages to process.

    Returns:
        dict mapping message id to its formatted string.
    """
    output_filename = "filter_data.json"
    output_dir = get_paths()['data_dir']
    output_file_path = output_dir / output_filename

    logger.info("开始执行 get_formatted_data...")
    raw_data = fetch_target_messages(target_ids)
    formatted_data = format_messages(raw_data)
    try:
        validate_format(formatted_data, target_ids)
        logger.info("数据格式验证通过。")
    except ValueError as ve:
        # Validation problems are reported but do not prevent saving.
        logger.error(f"❌ 数据格式验证失败: {ve}")
        logger.warning("数据格式验证失败,但仍将尝试保存当前格式化数据。")

    logger.info("准备将格式化数据保存到配置文件指定的目录...")
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(formatted_data, f, ensure_ascii=False, indent=2)
        logger.info(f"✅ 格式化数据已成功保存至: {output_file_path}")
    except Exception as e:
        logger.error(f"❌ 保存 {output_filename} 时发生错误: {e}", exc_info=True)

    logger.info("get_formatted_data 执行完毕。")
    return formatted_data
|
120 |
+
|
121 |
+
# --- 恢复 main 函数的标准调用 ---
|
122 |
+
# 移除 output_override_path 参数
|
123 |
+
def main(target_ids: List[int]):
    """Entry point used when the module is run as a script.

    Calls ``get_formatted_data`` and logs whether it returned anything.
    """
    logger.info("执行 main 函数 (用于直接运行脚本)...")
    outcome = get_formatted_data(target_ids)
    if not outcome:
        logger.warning("main 函数执行完成,但 get_formatted_data 未返回有效数据或保存失败。")
        return
    logger.info("main 函数执行完成,格式化数据已生成并尝试保存。")
|
132 |
+
|
133 |
+
def print_results(raw_data: Dict[str, Dict], formatted_data: Dict[str, str]):
    """Print the raw and the formatted data as indented JSON under their headings."""
    sections = (
        ("\n=== 原始数据 ===", raw_data),
        ("\n=== 格式化数据 ===", formatted_data),
    )
    for heading, payload in sections:
        print(heading)
        print(json.dumps(payload, ensure_ascii=False, indent=2))
|
138 |
+
|
139 |
+
if __name__ == '__main__':
    # Configure logging first. setup_logging() configures the root logger, which
    # the module-level ``logger`` inherits; fall back to basicConfig otherwise.
    try:
        from logger_config import setup_logging
        setup_logging()
    except ImportError:
        # logger_config is unavailable: use a basic configuration.
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        logger.warning("logger_config 未找到,使用基本日志配置。")
    except Exception as log_setup_err:
        logger.error(f"日志设置失败: {log_setup_err}", exc_info=True)
        # Still apply the basic configuration as a fallback.
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    logger.info(f"脚本 {__file__} 作为主程序运行...")

    TARGET_IDS = get_message_ids()

    main(TARGET_IDS)
|
LLM/todogen_LLM/jiaoben.py
ADDED
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from pathlib import Path
|
3 |
+
import sys
|
4 |
+
from datetime import timedelta
|
5 |
+
from dateutil.parser import parse
|
6 |
+
import mysql.connector
|
7 |
+
from mysql.connector import Error
|
8 |
+
from config_loader import get_mysql_config, get_paths
|
9 |
+
# 修改导入:导入比较函数,移除旧的合并函数
|
10 |
+
from compare_data import compare_and_generate_updates
|
11 |
+
import logging # 导入 logging
|
12 |
+
from logger_config import setup_logging # 导入 setup_logging
|
13 |
+
import traceback # 导入 traceback
|
14 |
+
|
15 |
+
# === 日志初始化 ===
|
16 |
+
setup_logging() # 在所有代码执行前调用
|
17 |
+
logger = logging.getLogger(__name__) # 获取 logger 实例
|
18 |
+
# =================
|
19 |
+
|
20 |
+
sys.stdout.reconfigure(encoding='utf-8')
|
21 |
+
|
22 |
+
def get_db_connection():
    """Open a MySQL connection from the configured settings.

    Returns:
        The connection object, or ``None`` when ``mysql.connector`` raises an
        ``Error`` (the failure is logged with its traceback).
    """
    logger.debug("尝试建立数据库连接...")
    try:
        settings = get_mysql_config()
        credential_keys = ('user', 'password', 'host', 'port', 'database', 'ssl_ca')
        connect_kwargs = {key: settings[key] for key in credential_keys}
        connection = mysql.connector.connect(ssl_disabled=False, **connect_kwargs)
        logger.info("✅ 数据库连接成功")
    except Error as e:
        logger.error(f"❌ 数据库连接失败: {e}", exc_info=True)
        return None
    return connection
|
46 |
+
|
47 |
+
def process_end_time(item):
    """Fill in a missing ``end_time`` as ``date`` plus one hour.

    ``end_time`` counts as missing when it is ``None`` or the string ``"null"``.
    When ``date`` is empty, ``"null"`` or cannot be parsed, ``end_time`` is set
    to the string ``"null"``. An existing ``end_time`` is left untouched.

    Args:
        item: dict that is modified in place.

    Returns:
        The same ``item`` object.
    """
    start_raw = item.get("date")
    current_end = item.get("end_time")

    # Nothing to do when an end time is already present.
    if current_end is not None and current_end != "null":
        return item

    replacement = "null"
    if start_raw and start_raw != "null":
        try:
            replacement = (parse(start_raw) + timedelta(hours=1)).isoformat()
        except (ValueError, TypeError):
            # Unparseable start date: keep the "null" marker.
            replacement = "null"

    item["end_time"] = replacement
    return item
|
69 |
+
|
70 |
+
|
71 |
+
def insert_to_database(data_list):
    """Insert the processed todo records into the ``todolist`` table.

    Field mapping (item key -> column): ``message_id`` -> ``message_id``,
    ``user_id`` -> ``user_id``, ``date`` -> ``start_time``, ``todo_content`` ->
    ``todo_content``, ``urgency`` -> ``urgency_statu``, ``end_time`` ->
    ``end_time``, ``location`` -> ``location``.

    Changes from the previous version: the connection is now closed when
    ``data_list`` is empty (it used to leak), ``cursor`` is initialised before
    the ``try`` so the cleanup cannot hit an unbound name, and two log messages
    were corrected (the statement is a plain INSERT, not INSERT IGNORE; the
    skip check covers only ``location`` and ``todo_content``).

    Args:
        data_list: list of item dicts; ``date``/``end_time`` may be the string
            ``"null"`` or empty, which is stored as SQL NULL.

    Returns:
        ``True`` when the insert succeeded or there was nothing to insert,
        ``False`` when no connection could be opened or the insert failed.
    """
    connection = get_db_connection()
    if not connection:
        # get_db_connection has already logged the failure.
        return False

    cursor = None
    try:
        if not data_list:
            logger.info("ℹ️ 没有数据需要插入数据库。")
            return True  # nothing to do still counts as success

        logger.info(f"准备将 {len(data_list)} 条记录插入数据库...")
        cursor = connection.cursor()

        insert_query = """
        INSERT INTO todolist
        (message_id, user_id, start_time, todo_content, urgency_statu, end_time, location)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        """

        records_to_insert = []
        skipped_count = 0
        for item in data_list:
            try:
                start_time = parse(item["date"]).strftime('%Y-%m-%d %H:%M:%S') if item.get("date") and item["date"] != "null" else None
                end_time = parse(item["end_time"]).strftime('%Y-%m-%d %H:%M:%S') if item.get("end_time") and item["end_time"] != "null" else None

                # message_id is converted to int; an empty/missing id is stored as NULL.
                # NOTE(review): assumes the message_id column is numeric - confirm against the schema.
                message_id_str = item.get("message_id", "")
                message_id = None
                if message_id_str:
                    try:
                        message_id = int(message_id_str)
                    except ValueError:
                        logger.warning(f"⚠️ 跳过记录,message_id '{message_id_str}' 不是有效的整数: {item}")
                        skipped_count += 1
                        continue

                user_id = item.get("user_id")
                todo_content = item.get("todo_content")
                urgency = item.get("urgency", "unimportant")
                location = item.get("location", "")

                if location is None or todo_content is None:
                    logger.warning(f"⚠️ 跳过记录,缺少 location 或 todo_content: {item}")
                    skipped_count += 1
                    continue

                # Tuple order must match the column list of insert_query.
                records_to_insert.append((
                    message_id,
                    user_id,
                    start_time,
                    todo_content,
                    urgency,
                    end_time,
                    location,
                ))

            except (ValueError, TypeError, KeyError) as e:
                logger.warning(f"⚠️ 处理记录时出错,已跳过: {item}, 错误: {e}")
                skipped_count += 1
                continue

        if skipped_count > 0:
            logger.info(f"ℹ️ 在准备插入数据库时跳过了 {skipped_count} 条记录。")

        if not records_to_insert:
            logger.info("ℹ️ 没有有效记录可供插入数据库。")
            return True  # no valid rows also counts as a completed run

        cursor.executemany(insert_query, records_to_insert)
        connection.commit()
        logger.info(f"✅ 成功插入 {len(records_to_insert)} 条记录到数据库")
        return True

    except Error as e:
        logger.error(f"❌ 数据库插入失败: {e}", exc_info=True)
        if connection.is_connected():
            connection.rollback()
            logger.warning("数据库事务已回滚")
        return False
    finally:
        if connection.is_connected():
            if cursor is not None:
                cursor.close()
            connection.close()
            logger.info("ℹ️ 数据库连接已关闭。")
|
187 |
+
|
188 |
+
# 重命名函数并修改逻辑
|
189 |
+
def process_and_insert_updates() -> bool:
    """
    Pull pending updates from compare_data, normalize them and store them in the database.

    Every incoming record is reduced to a fixed set of fields, passed through
    process_end_time, and has its "date" / "end_time" values rewritten as ISO
    strings (or the string "null" when they cannot be parsed). The resulting
    batch is then handed to insert_to_database.

    :return: True when the flow completes (including when there is nothing to
             process); False when fetching, processing or inserting fails.
    """
    # (key, default) pairs copied from each incoming record, in output order.
    field_defaults = (
        ("message_id", ""),  # kept for debugging or future use
        ("date", "null"),
        ("location", ""),
        ("end_time", None),  # raw value; handled by process_end_time below
        ("todo_content", ""),
        ("user_id", ""),
        ("urgency", "unimportant"),
    )

    logger.info("🚀 开始执行数据更新与插入流程...")
    try:
        # Step 1: ask compare_data for the records that need processing.
        logger.info("ℹ️ 开始从 compare_data 获取待处理数据...")
        pending = compare_and_generate_updates()

        if pending is None:
            logger.error("❌ 从 compare_data 获取数据失败。")
            return False

        if not pending:
            logger.info("ℹ️ compare_data 没有返回需要处理的数据。")
            # Nothing to do still counts as success; insert_to_database is
            # invoked with an empty list so it can run its own empty-input path.
            insert_to_database([])
            return True

        logger.info(f"ℹ️ 从 compare_data 成功获取 {len(pending)} 条待处理记录。")

        prepared = []
        for item in pending:
            # Step 2: extract the fields of interest, applying defaults.
            extracted = {key: item.get(key, default) for key, default in field_defaults}

            # Step 3: let the dedicated helper handle the end_time field.
            extracted = process_end_time(extracted)

            # Step 4: normalize the time fields to ISO format.
            for time_field in ("date", "end_time"):
                raw_value = extracted.get(time_field)
                if not raw_value or raw_value == "null":
                    continue
                try:
                    extracted[time_field] = parse(raw_value).isoformat()
                except (ValueError, TypeError):
                    # Unparseable values are logged and replaced with "null".
                    logger.warning(f"⚠️ 警告:无法解析字段 '{time_field}' 的值 '{raw_value}',将设为 null。记录:{item}")
                    extracted[time_field] = "null"

            prepared.append(extracted)

        # Step 5: write the prepared batch to the database.
        logger.info(f"ℹ️ 准备将处理后的 {len(prepared)} 条记录插入数据库...")
        inserted = insert_to_database(prepared)
        if not inserted:
            logger.error("❌ 数据插入数据库失败。")
            return False  # a failed insert fails the whole flow

        logger.info("✅ 数据处理和插入流程成功完成。")
        return True

    except Exception as e:
        # logger.exception records the traceback automatically.
        logger.exception(f"❌ 处理和插入过程中发生未预期错误: {str(e)}")
        return False
|
261 |
+
|
262 |
+
def compare_data() -> list:
    """
    Compare the loaded data sets, export the merged result and return pending updates.

    NOTE(review): in this version the JSON loading and the comparison steps are
    placeholders (see the comments below), so ``combined_data`` and
    ``updates_to_insert`` are never populated here. As written, the export
    branch is skipped and an empty list is returned.

    When ``combined_data`` is non-empty it is written as UTF-8 JSON to
    ``<this file's directory>/<data_dir>/todolist_export.json``, where
    ``data_dir`` comes from ``get_paths()`` (default ``"output"``). Export
    errors are logged and do not affect the return value.

    :return: list of records to insert; an empty list when loading fails.
    """
    # Placeholder: the code reading result1.json and filter_data.json goes here.
    try:
        # Placeholder: the json.load calls go here.
        logger.info("✅ 成功加载 result1.json 和 filter_data.json")
    except Exception as e:
        logger.error(f"❌ 加载 JSON 文件时出错: {e}", exc_info=True)
        return []  # or None, depending on how process_and_insert_updates handles it

    # Both accumulators must exist before the export / return code below runs.
    combined_data = {}
    updates_to_insert = []

    logger.info("🔍 开始比较数据并生成更新...")

    logger.info("📊 数据比较完成。")

    # Export the merged data to todolist_export.json.
    export_filename = "todolist_export.json"
    logger.info("准备导出合并后的数据到配置文件指定的目录...")
    if combined_data:
        try:
            paths_config = get_paths()
            output_dir_rel = paths_config.get('data_dir', 'output')
            script_dir = Path(__file__).resolve().parent
            output_dir_abs = script_dir / output_dir_rel
            export_file_path = output_dir_abs / export_filename

            # Path diagnostics belong at DEBUG: CRITICAL is reserved for
            # failures that may stop the program.
            logger.debug(f"JIAOBEN SAVE EXPORT: Target directory: {output_dir_abs.resolve()}")
            logger.debug(f"JIAOBEN SAVE EXPORT: Target file: {export_file_path.resolve()}")

            output_dir_abs.mkdir(parents=True, exist_ok=True)

            with open(export_file_path, 'w', encoding='utf-8') as f:
                json.dump(combined_data, f, ensure_ascii=False, indent=2)
            logger.info(f"✅ 合并数据已成功导出至: {export_file_path}")
        except KeyError as e:
            logger.error(f"❌ config.yaml 中缺少路径配置项 'data_dir': {e}")
        except Exception as e:
            logger.error(f"❌ 保存 {export_filename} 时发生错误: {e}", exc_info=True)
    else:
        logger.warning(f"没有合并数据可以导出到 {export_filename}。")

    logger.info(f"compare_data 函数准备返回 {len(updates_to_insert)} 条待插入记录。")
    return updates_to_insert
|
311 |
+
|
312 |
+
if __name__ == "__main__":
    # Script entry point: run the full process-and-insert flow and report the outcome.
    logger.info("🚀 脚本入口:开始执行数据更新与插入流程...")
    succeeded = process_and_insert_updates()
    if not succeeded:
        logger.error("🔥 处理流程失败,请检查日志中的错误信息。")
    else:
        logger.info("🎉 流程执行完毕。")
|
LLM/todogen_LLM/logger_config.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# logger_config.py
|
2 |
+
import logging
|
3 |
+
from config_loader import get_paths
|
4 |
+
import os
|
5 |
+
import datetime
|
6 |
+
from logging.handlers import RotatingFileHandler # Use RotatingFileHandler for potential future size limits
|
7 |
+
|
8 |
+
# Log folder path constant.
# NOTE(review): setup_logging() below takes its directory from get_paths() and
# does not reference this constant — confirm whether it is still used elsewhere.
LOG_FOLDER = "todogen_LLM/todogen_logging"
# Level applied to the root logger and to both handlers created by setup_logging().
LOG_LEVEL = logging.INFO # Default level, can be changed (e.g., logging.DEBUG)
|
10 |
+
|
11 |
+
def setup_logging():
    """Configure the root logger with a daily log file and a console handler.

    The log directory is read from ``get_paths()['logging_dir']`` and created
    if it does not exist. The log file is named ``YYYY-MM-DD.log`` after
    today's date and opened in append mode. Handlers are attached only when
    the root logger has none yet, so calling this function again does not
    duplicate output.

    Any exception raised during setup is caught: logging falls back to
    ``logging.basicConfig(level=logging.ERROR)`` and the failure is logged
    with its traceback instead of being propagated.
    """
    # Function-scope import: only the path normalization below needs it.
    from pathlib import Path

    try:
        paths = get_paths()
        # Normalize to Path so a plain-string config value works as well as a
        # Path object (Path(Path(...)) is a no-op for existing Path values).
        log_dir = Path(paths['logging_dir'])

        # Create the log directory on demand.
        log_dir.mkdir(parents=True, exist_ok=True)

        # One log file per calendar day.
        today_str = datetime.date.today().strftime('%Y-%m-%d')
        log_filename = log_dir / f"{today_str}.log"

        # Example output: 2023-10-27 15:30:00 - INFO - [module_name:42] - Log message
        log_format = logging.Formatter(
            '%(asctime)s - %(levelname)s - [%(name)s:%(lineno)d] - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )

        # Configure the root logger; loggers in other modules inherit from it.
        logger = logging.getLogger()
        logger.setLevel(LOG_LEVEL)

        # Attach handlers only once, in case setup_logging is called again.
        if not logger.handlers:
            # File handler: appends to the daily log file.
            file_handler = logging.FileHandler(log_filename, mode='a', encoding='utf-8')
            file_handler.setLevel(LOG_LEVEL)
            file_handler.setFormatter(log_format)
            logger.addHandler(file_handler)

            # Console handler: mirrors the log to the terminal.
            console_handler = logging.StreamHandler()
            console_handler.setLevel(LOG_LEVEL)  # could be stricter, e.g. WARNING
            console_handler.setFormatter(log_format)
            logger.addHandler(console_handler)

            logger.info("Logging setup complete. Logging to %s", log_filename)

    except Exception as e:
        # Fall back to basic logging so the failure itself is still reported.
        logging.basicConfig(level=logging.ERROR)
        logging.error("Failed to configure logging: %s", e, exc_info=True)
        # Deliberately not re-raised: a logging failure should not stop the caller.
|
66 |
+
|
67 |
+
# Optional: Call setup immediately when this module is imported?
|
68 |
+
# Or better, call it explicitly from the main entry point script.
|
69 |
+
# setup_logging() # Avoid calling here, call from main script instead.
|