# main.py (Corrected PermissionError and Integrated Crawl4AI as Primary)
import os
import re
import logging
import asyncio
import json
import html
import contextlib
import traceback
import urllib.parse # Added for URL encoding
from typing import Optional, Dict, Any, Tuple, Union # Added Union
# --- Frameworks ---
from starlette.applications import Starlette
from starlette.routing import Route
from starlette.responses import PlainTextResponse, JSONResponse, Response
from starlette.requests import Request
# --- Telegram Bot ---
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, Bot
from telegram.ext import (
Application,
CommandHandler,
MessageHandler,
filters,
ContextTypes,
CallbackQueryHandler,
)
from telegram.constants import ParseMode
from telegram.error import NetworkError, RetryAfter, TimedOut, BadRequest, TelegramError
from telegram.request import HTTPXRequest, BaseRequest
# --- Other Libraries ---
import httpx
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, before_sleep_log
try:
import lxml
DEFAULT_PARSER = 'lxml'
except ImportError:
DEFAULT_PARSER = 'html.parser'
# --- Google Gemini ---
try:
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
_gemini_available = True
except ImportError:
genai = None
HarmCategory = None
HarmBlockThreshold = None
_gemini_available = False
# logger will be defined later, log warning after logger setup
# --- Crawl4AI (NEW Primary Scraper) ---
try:
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig, CacheMode, CrawlResult
from crawl4ai.models import MarkdownGenerationResult # Specific import for type hint
_crawl4ai_available = True
except ImportError:
AsyncWebCrawler = None
CrawlerRunConfig = None
BrowserConfig = None
CacheMode = None
CrawlResult = None
MarkdownGenerationResult = None # Corrected typo
_crawl4ai_available = False
# logger will be defined later, log warning after logger setup
# --- Logging Setup ---
logging.basicConfig( format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO )
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("telegram.ext").setLevel(logging.INFO)
logging.getLogger('telegram.bot').setLevel(logging.INFO)
logging.getLogger("urllib3").setLevel(logging.INFO)
logging.getLogger('gunicorn.error').setLevel(logging.INFO)
logging.getLogger('uvicorn').setLevel(logging.INFO)
logging.getLogger('starlette').setLevel(logging.INFO)
if _gemini_available: logging.getLogger("google.ai.generativelanguage").setLevel(logging.WARNING)
# Keep C4AI logs less verbose unless debugging
if _crawl4ai_available: logging.getLogger("crawl4ai").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.info(f"Logging configured. Using BS4 parser: {DEFAULT_PARSER}")
if not _gemini_available: logger.warning("google-generativeai library not found. Gemini functionality disabled.")
if not _crawl4ai_available: logger.warning("crawl4ai library not found. Primary Web Scraping (Crawl4AI) disabled.")
# --- Global variable for PTB app ---
ptb_app: Optional[Application] = None
# --- Define a writable base directory for Crawl4AI ---
# Use /app which is the WORKDIR in the Dockerfile and is generally writable
CRAWL4AI_BASE_DIR = "/app/.crawl4ai_cache"
if _crawl4ai_available:
try:
os.makedirs(CRAWL4AI_BASE_DIR, exist_ok=True)
logger.info(f"Ensured Crawl4AI base directory exists and is writable: {CRAWL4AI_BASE_DIR}")
except Exception as e:
# Log the error but proceed; Crawl4AI might still work without cache/DB features
logger.error(f"Could not create Crawl4AI base directory {CRAWL4AI_BASE_DIR}: {e}. Crawl4AI caching/DB features might fail.")
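# Note (assumption, not part of the original setup): if /app were ever not writable in a
# particular deployment, a temporary directory could be substituted instead, e.g.
#   import tempfile
#   CRAWL4AI_BASE_DIR = os.path.join(tempfile.gettempdir(), "crawl4ai_cache")
# The directory-creation logic above would remain unchanged.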
# --- Environment Variable Loading & Configuration ---
logger.info("Attempting to load secrets and configuration...")
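# get_secret logs only the first few characters (at most 8) of each value so presence
# can be confirmed in the logs without leaking the secret itself.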
def get_secret(secret_name):
value = os.environ.get(secret_name)
if value: status = "Found"; log_length = min(len(value), 8); value_start = value[:log_length]; logger.info(f"Secret '{secret_name}': {status} (Value starts with: {value_start}...)")
else: status = "Not Found"; logger.warning(f"Secret '{secret_name}': {status}")
return value
TELEGRAM_TOKEN = get_secret('TELEGRAM_TOKEN')
OPENROUTER_API_KEY = get_secret('OPENROUTER_API_KEY') # Summarizer Fallback
URLTOTEXT_API_KEY = get_secret('URLTOTEXT_API_KEY') # Scrape Fallback 2 (WAS 1)
SUPADATA_API_KEY = get_secret('SUPADATA_API_KEY') # YT Fallback 1
APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN') # YT Fallback 2 + Scrape Fallbacks 5 & 6 (WAS 4 & 5)
RAPIDAPI_KEY = get_secret('RAPIDAPI_KEY') # Scrape Fallbacks 3 & 4 (WAS 2 & 3)
WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
GEMINI_API_KEY = get_secret('GEMINI_API_KEY') # Primary Summarizer
# Models (User can still configure via env vars)
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-coder-33b-instruct") # Fallback Model
APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts") # Default YT Actor
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-1.5-flash-latest") # Primary Model
# Specific Actor IDs for Website Scraping Fallbacks
APIFY_CRAWLER_ACTOR_ID = "apify/website-content-crawler" # Fallback 5 (WAS 4)
APIFY_TEXT_SCRAPER_ACTOR_ID = "karamelo/text-scraper-free" # Fallback 6 (WAS 5)
if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
if not GEMINI_API_KEY: logger.error("❌ ERROR: GEMINI_API_KEY not found. Primary summarization (Gemini) will fail.")
if not OPENROUTER_API_KEY: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback summarization will fail.")
if not RAPIDAPI_KEY: logger.warning("⚠️ WARNING: RAPIDAPI_KEY not found. RapidAPI scraping fallbacks (3 & 4) will be unavailable.") # Updated numbers
if not APIFY_API_TOKEN: logger.warning("⚠️ WARNING: APIFY_API_TOKEN not found. YT transcript fallback (2) and Website scraping fallbacks (5 & 6) will be unavailable.") # Updated numbers
_gemini_primary_enabled = _gemini_available and bool(GEMINI_API_KEY)
if not _gemini_available: logger.warning("⚠️ WARNING: google-generativeai library missing. Gemini disabled.")
elif not GEMINI_API_KEY: logger.warning("⚠️ WARNING: GEMINI_API_KEY not found or empty. Gemini disabled.")
_openrouter_fallback_enabled = bool(OPENROUTER_API_KEY)
if not _openrouter_fallback_enabled: logger.warning("⚠️ WARNING: OPENROUTER_API_KEY not found. Fallback disabled.")
_crawl4ai_primary_scrape_enabled = _crawl4ai_available # Check if library loaded
if not _crawl4ai_available: logger.error("❌ ERROR: crawl4ai library missing. Primary web scraping disabled. Will attempt fallbacks immediately.")
if not URLTOTEXT_API_KEY: logger.warning("Optional secret 'URLTOTEXT_API_KEY' not found. Web scraping fallback 2 unavailable.") # Updated number
if not SUPADATA_API_KEY: logger.warning("Optional secret 'SUPADATA_API_KEY' not found. YT transcript fallback 1 unavailable.")
# APIFY_API_TOKEN warning handled above
# RAPIDAPI_KEY warning handled above
if not WEBHOOK_SECRET: logger.info("Optional secret 'WEBHOOK_SECRET' not found. Webhook security disabled.")
logger.info("Secret loading and configuration check finished.")
logger.info(f"Primary Web Scraper (Crawl4AI): {'ENABLED' if _crawl4ai_primary_scrape_enabled else 'DISABLED - Check Logs for Details'}")
logger.info(f"Using Gemini Model (Primary Summarizer): {GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'}")
logger.info(f"Using OpenRouter Model (Fallback Summarizer): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}")
logger.info(f"Using Apify Actor (YT Default): {APIFY_ACTOR_ID}")
logger.info(f"Using Apify Actor (Web Scrape Fallback 5): {APIFY_CRAWLER_ACTOR_ID}")
logger.info(f"Using Apify Actor (Web Scrape Fallback 6): {APIFY_TEXT_SCRAPER_ACTOR_ID}")
_apify_token_exists = bool(APIFY_API_TOKEN)
_urltotext_key_exists = bool(URLTOTEXT_API_KEY)
_rapidapi_key_exists = bool(RAPIDAPI_KEY)
if _gemini_primary_enabled:
try: genai.configure(api_key=GEMINI_API_KEY); logger.info("Google GenAI client configured successfully.")
except Exception as e: logger.error(f"Failed to configure Google GenAI client: {e}"); _gemini_primary_enabled = False
# --- Retry Decorator (Unchanged) ---
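# Behaviour summary: retries the wrapped bot call up to 4 times with exponential
# backoff (2-15s) on NetworkError, RetryAfter, TimedOut and BadRequest; benign
# BadRequest messages (e.g. "message is not modified") are swallowed and return None.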
@retry( stop=stop_after_attempt(4), wait=wait_exponential(multiplier=1, min=2, max=15), retry=retry_if_exception_type((NetworkError, RetryAfter, TimedOut, BadRequest)), before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True )
async def retry_bot_operation(func, *args, **kwargs):
try: return await func(*args, **kwargs)
except BadRequest as e:
ignore_errors = [ "message is not modified", "query is too old", "message to edit not found", "chat not found", "bot was blocked by the user", ]
if any(err in str(e).lower() for err in ignore_errors): logger.warning(f"Ignoring non-critical BadRequest: {e}"); return None
logger.error(f"Potentially critical BadRequest: {e}"); raise
except TelegramError as e: logger.warning(f"TelegramError (will retry if applicable): {e}"); raise
except Exception as e: logger.error(f"Unexpected error during bot operation: {e}", exc_info=True); raise
# --- Helper Functions (Unchanged) ---
def is_youtube_url(url):
youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
match = youtube_regex.search(url); logger.debug(f"is_youtube_url '{url}': {bool(match)}"); return bool(match)
def extract_youtube_id(url):
youtube_regex = re.compile( r'(?:https?://)?(?:www\.)?(?:m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/' r'(?:watch\?v=|embed/|v/|shorts/|live/|attribution_link\?a=.*&u=/watch\?v=)?' r'([\w-]{11})' r'(?:\S+)?', re.IGNORECASE)
match = youtube_regex.search(url)
if match: video_id = match.group(1); logger.debug(f"Extracted YT ID '{video_id}' from {url}"); return video_id
else: logger.warning(f"Could not extract YT ID from {url}"); return None
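# Illustrative examples (not executed): both helpers accept the common URL forms, e.g.
#   extract_youtube_id("https://youtu.be/dQw4w9WgXcQ")            -> "dQw4w9WgXcQ"
#   is_youtube_url("https://www.youtube.com/watch?v=dQw4w9WgXcQ") -> True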
# --- Content Fetching Functions ---
# --- YouTube Transcript Fetching (Unchanged) ---
async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[str]:
if not video_id: logger.error("[Supadata] No video_id provided"); return None
if not api_key: logger.error("[Supadata] API key missing."); return None
logger.info(f"[YT Fallback 1] Attempting fetch for video ID: {video_id} via Supadata")
api_endpoint = "https://api.supadata.ai/v1/youtube/transcript"
params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
try:
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.get(api_endpoint, headers=headers, params=params)
logger.debug(f"[Supadata] Status code {response.status_code} for {video_id}")
if response.status_code == 200:
try:
data = response.json() if response.text else None # Check if text exists before json decode
content = None
if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
if not content and response.text: content = response.text # Fallback to raw text if json parse fails or content key missing
if content and isinstance(content, str): logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}"); return content.strip()
else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
except json.JSONDecodeError: logger.warning(f"[Supadata] Received 200 but failed JSON decode for {video_id}. Using raw text if available. Response: {response.text[:200]}"); return response.text.strip() if response.text else None
except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
elif response.status_code in [401, 403]: logger.error(f"[Supadata] Auth error ({response.status_code}). Check API key."); return None
elif response.status_code == 404: logger.warning(f"[Supadata] Not found (404) for {video_id}."); return None
else: logger.error(f"[Supadata] Unexpected status {response.status_code} for {video_id}. Resp: {response.text[:200]}"); return None
except httpx.TimeoutException: logger.error(f"[Supadata] Timeout connecting for {video_id}"); return None
except httpx.RequestError as e:
if "CERTIFICATE_VERIFY_FAILED" in str(e): logger.error(f"[Supadata] SSL Cert Verify Failed for {video_id}: {e}")
else: logger.error(f"[Supadata] Request error for {video_id}: {e}")
return None
except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
"""Fallback YT 2: Fetches YouTube transcript using default Apify Actor."""
global APIFY_ACTOR_ID # Uses the default YT actor ID
if not video_url: logger.error("[Apify YT] No video_url provided"); return None
if not api_token: logger.error("[Apify YT] API token missing."); return None
logger.info(f"[YT Fallback 2] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
params = {"token": api_token}
payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, }
headers = {"Content-Type": "application/json"}
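# Note: the JSON body below is forwarded to the actor as its run input; the
# run-sync-get-dataset-items endpoint waits for the run to finish and returns
# the resulting dataset items directly.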
try:
async with httpx.AsyncClient(timeout=120.0) as client:
logger.debug(f"[Apify YT] POST Request to {sync_items_endpoint} for {video_url}")
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
logger.debug(f"[Apify YT] Received status code {response.status_code} for {video_url}")
if response.status_code == 200:
try:
results = response.json()
if isinstance(results, list) and len(results) > 0:
item = results[0]; content = None
if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
elif "text" in item and isinstance(item["text"], str): content = item["text"]
elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"]
elif "captions" in item and isinstance(item["captions"], list):
if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])
if content and isinstance(content, str): logger.info(f"[Apify YT] Success via REST for {video_url}. Length: {len(content)}"); return content.strip()
else: logger.warning(f"[Apify YT] Dataset item parsed but transcript content empty/invalid format for {video_url}. Item keys: {list(item.keys())}"); return None
else: logger.warning(f"[Apify YT] Actor success but dataset was empty for {video_url}. Response: {results}"); return None
except json.JSONDecodeError: logger.error(f"[Apify YT] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}"); return None
except Exception as e: logger.error(f"[Apify YT] Error processing success response for {video_url}: {e}", exc_info=True); return None
elif response.status_code == 400: logger.error(f"[Apify YT] Bad Request (400) for {video_url}. Check payload. Resp:{response.text[:200]}"); return None
elif response.status_code == 401: logger.error("[Apify YT] Auth error (401). Check token."); return None
elif response.status_code == 404: logger.error(f"[Apify YT] Endpoint/Actor Not Found (404). Actor: {APIFY_ACTOR_ID} Resp:{response.text[:200]}"); return None
else: logger.error(f"[Apify YT] Unexpected status {response.status_code} for {video_url}. Resp:{response.text[:200]}"); return None
except httpx.TimeoutException as e: logger.error(f"[Apify YT] Timeout during API interaction for {video_url}: {e}"); return None
except httpx.HTTPStatusError as e: logger.error(f"[Apify YT] HTTP Status Error during API interaction for {video_url}: {e}"); return None
except httpx.RequestError as e: logger.error(f"[Apify YT] Request error during API interaction for {video_url}: {e}"); return None
except Exception as e: logger.error(f"[Apify YT] Unexpected error during Apify YT call for {video_url}: {e}", exc_info=True); return None
async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]:
global SUPADATA_API_KEY, APIFY_API_TOKEN, _apify_token_exists
if not video_id: logger.error("get_youtube_transcript: No video_id"); return None
logger.info(f"Fetching transcript for video ID: {video_id} (URL: {video_url})")
transcript_text = None
logger.info("[Primary YT] Attempting youtube-transcript-api...")
try:
transcript_list = await asyncio.to_thread( YouTubeTranscriptApi.get_transcript, video_id, languages=['en', 'en-GB', 'en-US'] )
if transcript_list: transcript_text = " ".join([item['text'] for item in transcript_list if 'text' in item])
if transcript_text: logger.info(f"[Primary YT] Success via lib for {video_id} (len: {len(transcript_text)})"); return transcript_text
else: logger.warning(f"[Primary YT] Transcript list/text empty for {video_id}"); transcript_text = None
except NoTranscriptFound: logger.warning(f"[Primary YT] No transcript found via lib for {video_id}.")
except TranscriptsDisabled: logger.warning(f"[Primary YT] Transcripts disabled via lib for {video_id}.")
except Exception as e: logger.warning(f"[Primary YT] Error via lib for {video_id}: {e}"); transcript_text = None
if transcript_text is None:
logger.info("[Fallback YT 1] Trying Supadata API...")
if SUPADATA_API_KEY:
transcript_text = await get_transcript_via_supadata(video_id, SUPADATA_API_KEY)
if transcript_text: logger.info(f"[Fallback YT 1] Success via Supadata for {video_id}"); return transcript_text
else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
if transcript_text is None:
logger.info("[Fallback YT 2] Trying Apify REST API (Default YT Actor)...")
if _apify_token_exists:
transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify Default YT Actor for {video_url}"); return transcript_text
else: logger.warning(f"[Fallback YT 2] Apify Default YT Actor failed or no content for {video_url}.")
else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
return transcript_text
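# Retrieval order: youtube-transcript-api (primary) -> Supadata (fallback 1) ->
# Apify default YT actor (fallback 2); the first non-empty transcript wins and
# None signals to the caller that every method failed.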
# --- Website Content Fetching (MODIFIED SECTION) ---
# --- Method 0: Primary Web Scrape (Crawl4AI) ---
async def get_website_content_via_crawl4ai(url: str) -> Optional[str]:
"""Primary Web Method: Fetches and extracts content using Crawl4AI."""
global _crawl4ai_primary_scrape_enabled, CRAWL4AI_BASE_DIR # Use the defined base dir
if not _crawl4ai_primary_scrape_enabled:
logger.warning("[Web Scrape Primary] Crawl4AI called but library/driver is unavailable.")
return None
if not url: logger.error("[Web Scrape Primary] Crawl4AI: No URL provided"); return None
logger.info(f"[Web Scrape Primary] Attempting fetch and extraction via Crawl4AI for: {url}")
# Configure the crawl run - enable cache now
run_config = CrawlerRunConfig(
cache_mode=CacheMode.ENABLED, # Use cache now that base_dir is set
page_timeout=60000, # 60 sec timeout
verbose=False, # Keep logs cleaner
scan_full_page=True, # Try to load dynamic content by scrolling
remove_overlay_elements=True, # Try to remove cookie banners/popups
# Consider adding markdown generation strategy if needed later
# from crawl4ai.content_filter_strategy import PruningContentFilter
# from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
# md_generator = DefaultMarkdownGenerator(content_filter=PruningContentFilter())
# markdown_generator=md_generator,
)
# BrowserConfig defaults are usually okay (headless chromium)
# browser_config = BrowserConfig(headless=True, verbose=False)
extracted_text: Optional[str] = None
try:
# Use context manager and provide base_directory to fix PermissionError
# Pass browser_config if needed: AsyncWebCrawler(config=browser_config, base_directory=CRAWL4AI_BASE_DIR)
async with AsyncWebCrawler(base_directory=CRAWL4AI_BASE_DIR) as crawler:
logger.debug(f"[Web Scrape Primary] Calling Crawl4AI crawler.arun for {url}")
result: CrawlResult = await crawler.arun(url=url, config=run_config)
logger.debug(f"[Web Scrape Primary] Crawl4AI arun completed. Success: {result.success}, Status: {result.status_code}")
if result.success:
# Check for markdown generation result first (preferred)
if result.markdown and isinstance(result.markdown, MarkdownGenerationResult):
# Prioritize 'fit_markdown' if available and substantial
if result.markdown.fit_markdown and isinstance(result.markdown.fit_markdown, str) and len(result.markdown.fit_markdown.strip()) > 50:
extracted_text = result.markdown.fit_markdown.strip()
logger.debug(f"[Web Scrape Primary] Using 'fit_markdown' from MarkdownGenerationResult for {url}")
# Fallback to 'raw_markdown' if 'fit_markdown' is missing/short
elif result.markdown.raw_markdown and isinstance(result.markdown.raw_markdown, str):
extracted_text = result.markdown.raw_markdown.strip()
logger.debug(f"[Web Scrape Primary] Using 'raw_markdown' (fit_markdown unavailable/short) for {url}")
else:
logger.warning(f"[Web Scrape Primary] Markdown object present but no usable text content (fit/raw) for {url}. Trying cleaned_html.")
# Fall through to cleaned_html parsing if markdown is unusable
# Handle if result.markdown is just a string (older version compatibility?)
elif result.markdown and isinstance(result.markdown, str):
extracted_text = result.markdown.strip()
logger.debug(f"[Web Scrape Primary] Using direct result.markdown string for {url}")
# If no markdown or unusable markdown, try parsing cleaned_html
if not extracted_text and result.cleaned_html:
logger.warning(f"[Web Scrape Primary] No usable markdown found, parsing cleaned_html with BS4 for {url}")
try:
# Use a simple BS4 parse as a fallback within Crawl4AI's result
soup = BeautifulSoup(result.cleaned_html, DEFAULT_PARSER)
extracted_text = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
except Exception as bs_err:
logger.error(f"[Web Scrape Primary] Error parsing Crawl4AI's cleaned_html with BS4 for {url}: {bs_err}")
extracted_text = None # Ensure it's None if parsing fails
# Final check on extracted text length
if extracted_text and len(extracted_text) > 50: # Check for meaningful content length
logger.info(f"[Web Scrape Primary] Success via Crawl4AI for {url}. Length: {len(extracted_text)}")
return extracted_text
else:
content_len = len(extracted_text) if extracted_text else 0
logger.warning(f"[Web Scrape Primary] Crawl4AI success but extracted text too short or empty for {url}. Length: {content_len}. Will try fallbacks.")
return None # Return None to trigger fallbacks
else:
error_msg = result.error_message or f"Crawl failed (status code: {result.status_code})"
logger.error(f"[Web Scrape Primary] Crawl4AI failed for {url}. Error: {error_msg}. Will try fallbacks.")
return None # Return None to trigger fallbacks
except asyncio.TimeoutError:
logger.error(f"[Web Scrape Primary] Timeout error during Crawl4AI crawl for {url}. Will try fallbacks.")
return None
except ImportError as ie:
if "playwright" in str(ie).lower():
logger.critical(f"[Web Scrape Primary] Playwright library missing or drivers not installed! Run 'pip install playwright && playwright install --with-deps'. Error: {ie}")
_crawl4ai_primary_scrape_enabled = False # Disable future attempts
else:
logger.error(f"[Web Scrape Primary] Unexpected ImportError during Crawl4AI execution for {url}: {ie}", exc_info=True)
return None # Return None to trigger fallbacks
except Exception as e:
# Catch potential Playwright errors about missing executables explicitly
if "playwright" in str(e).lower() and ("install" in str(e).lower() or "executable" in str(e).lower() or "path" in str(e).lower()):
logger.critical("[Web Scrape Primary] Playwright drivers likely missing! Run 'playwright install --with-deps' in your environment. Disabling Crawl4AI.")
_crawl4ai_primary_scrape_enabled = False # Disable future attempts
else:
logger.error(f"[Web Scrape Primary] Unexpected error during Crawl4AI execution for {url}: {e}", exc_info=True)
return None # Return None to trigger fallbacks
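# Illustrative usage (sketch only, not executed at import time): callers treat a
# None return as "try the next scraper", e.g.
#   content = await get_website_content_via_crawl4ai(url)
#   if content is None:
#       content = await get_website_content_direct_bs4(url)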
# --- Fallback 1: Direct Fetch + BS4 (Previously Primary) ---
async def fetch_url_content_for_scrape(url: str, timeout: int = 25) -> Optional[str]:
"""Directly fetches URL content using httpx. (Fallback Web Method 1 - Fetching part)"""
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1' }
try:
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, headers=headers) as client:
logger.debug(f"[Web Scrape Fallback 1] Sending GET request to {url}")
response = await client.get(url)
logger.debug(f"[Web Scrape Fallback 1] Received response {response.status_code} from {url}")
response.raise_for_status()
content_type = response.headers.get('content-type', '').lower()
if 'html' not in content_type and 'xml' not in content_type:
logger.warning(f"[Web Scrape Fallback 1] Non-HTML/XML content type received from {url}: {content_type}")
if 'text/plain' in content_type: logger.info(f"[Web Scrape Fallback 1] Content type is text/plain for {url}, reading."); return response.text
return None
try: return response.text
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Error decoding response text for {url}: {e}"); return None
except httpx.HTTPStatusError as e: logger.error(f"[Web Scrape Fallback 1] HTTP error {e.response.status_code} fetching {url}: {e}")
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 1] Timeout error fetching {url}")
except httpx.TooManyRedirects: logger.error(f"[Web Scrape Fallback 1] Too many redirects fetching {url}")
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 1] Request error fetching {url}: {e}")
except Exception as e: logger.error(f"[Web Scrape Fallback 1] Unexpected error fetching {url}: {e}", exc_info=True)
return None
async def get_website_content_direct_bs4(url: str) -> Optional[str]:
"""Fallback 1: Fetches HTML directly and parses with BeautifulSoup."""
if not url: logger.error("[Web Scrape Fallback 1] No URL provided"); return None
logger.info(f"[Web Scrape Fallback 1] Attempting direct fetch and parse for: {url}")
html_content = await fetch_url_content_for_scrape(url)
if not html_content: logger.warning(f"[Web Scrape Fallback 1] Direct fetch failed for {url}."); return None
try:
def parse_html(content: str) -> Optional[str]:
try:
soup = BeautifulSoup(content, DEFAULT_PARSER)
# More aggressive removal of potentially noisy tags
for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "input", "textarea", "select", "option", "iframe", "img", "svg", "link", "meta", "noscript", "figure", "figcaption", "picture", "source", "map", "area", "details", "dialog"]):
element.extract()
# Try common main content containers
main_content = soup.find('main') or soup.find('article') or soup.find(role='main') or soup.find(id=re.compile(r'content|main|body|post', re.I)) or soup.find(class_=re.compile(r'content|main|body|article|post|entry', re.I))
target_element = main_content if main_content else soup.body
if not target_element:
logger.warning(f"[Web Scrape Fallback 1 Parse] Could not find body or main content candidates for {url}")
# Fallback: Get text from the whole soup if no specific container found
text_from_root = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
if text_from_root and len(text_from_root) > 50:
logger.warning(f"[Web Scrape Fallback 1 Parse] Using text from root as fallback for {url}. Length: {len(text_from_root)}")
return text_from_root
return None # Really couldn't find anything useful
# Extract text from the chosen element (main_content or body)
lines = [line.strip() for line in target_element.get_text(separator='\n', strip=True).splitlines() if line.strip()]
text = " ".join(lines)
# Check if the extracted text is meaningful
if not text or len(text) < 50: # Increased threshold slightly
logger.warning(f"[Web Scrape Fallback 1 Parse] Extracted text from target element too short or empty for {url}. Length: {len(text)}")
# As a final attempt, try getting text from the entire soup again
text_from_root_final = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
if text_from_root_final and len(text_from_root_final) > 50:
logger.warning(f"[Web Scrape Fallback 1 Parse] Reverting to text from root as final attempt for {url}. Length: {len(text_from_root_final)}")
return text_from_root_final
return None # Give up if even root text is too short
return text # Return the text from the target element
except Exception as parse_e:
logger.error(f"[Web Scrape Fallback 1 Parse] BS4 parsing error for {url}: {parse_e}", exc_info=False)
return None
# Run parsing in a separate thread to avoid blocking asyncio loop
text_content = await asyncio.to_thread(parse_html, html_content)
if text_content:
logger.info(f"[Web Scrape Fallback 1] Success via direct fetch & parse for {url} (len: {len(text_content)})")
return text_content
else:
logger.warning(f"[Web Scrape Fallback 1] Parsing failed or yielded no meaningful content for {url}.")
return None
except Exception as e:
logger.error(f"[Web Scrape Fallback 1] Unexpected error during parsing phase for {url}: {e}", exc_info=True)
return None
# --- Fallback 2: urltotext.com API ---
async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
"""Fallback 2: Fetches website content using urltotext.com API."""
if not url: logger.error("[Web Scrape Fallback 2] No URL"); return None
if not api_key: logger.error("[Web Scrape Fallback 2] urltotext.com API key missing."); return None
logger.info(f"[Web Scrape Fallback 2] Attempting fetch for: {url} using urltotext.com API")
api_endpoint = "https://urltotext.com/api/v1/urltotext/"
payload = { "url": url, "output_format": "text", "extract_main_content": True, "render_javascript": True, "residential_proxy": False }
headers = { "Authorization": f"Token {api_key}", "Content-Type": "application/json" }
try:
async with httpx.AsyncClient(timeout=45.0) as client:
logger.debug(f"[Web Scrape Fallback 2] Sending request to urltotext.com API for {url}")
response = await client.post(api_endpoint, headers=headers, json=payload)
logger.debug(f"[Web Scrape Fallback 2] Received status {response.status_code} from urltotext.com API for {url}")
if response.status_code == 200:
try:
data = response.json()
content = data.get("data", {}).get("content"); credits = data.get("credits_used", "N/A"); warning = data.get("data", {}).get("warning")
if warning: logger.warning(f"[Web Scrape Fallback 2] urltotext.com API Warning for {url}: {warning}")
if content and isinstance(content, str) and len(content.strip()) > 30: # Check length after stripping
logger.info(f"[Web Scrape Fallback 2] Success via urltotext.com API for {url}. Len: {len(content.strip())}. Credits: {credits}")
return content.strip()
else:
content_len = len(content.strip()) if content and isinstance(content, str) else 0
logger.warning(f"[Web Scrape Fallback 2] urltotext.com API success but content empty/short for {url}. Len: {content_len}. Resp: {data}"); return None
except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 2] Failed JSON decode urltotext.com for {url}. Resp:{response.text[:500]}"); return None
except Exception as e: logger.error(f"[Web Scrape Fallback 2] Error processing urltotext.com success response for {url}: {e}", exc_info=True); return None
elif response.status_code in [400, 401, 402, 403, 422, 500]: logger.error(f"[Web Scrape Fallback 2] Error {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
else: logger.error(f"[Web Scrape Fallback 2] Unexpected status {response.status_code} from urltotext.com API for {url}. Resp:{response.text[:200]}"); return None
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 2] Timeout connecting to urltotext.com API for {url}"); return None
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 2] Request error connecting to urltotext.com API for {url}: {e}"); return None
except Exception as e: logger.error(f"[Web Scrape Fallback 2] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
# --- Fallback 3: Scraper's Proxy Parser via RapidAPI ---
async def get_website_content_via_scrapers_proxy(url: str, api_key: str) -> Optional[str]:
"""Fallback 3: Fetches website content using Scraper's Proxy Parser via RapidAPI."""
if not url: logger.error("[Web Scrape Fallback 3] No URL provided"); return None
if not api_key: logger.error("[Web Scrape Fallback 3] RapidAPI key missing."); return None
logger.info(f"[Web Scrape Fallback 3] Attempting fetch for: {url} using Scraper's Proxy Parser API")
api_host = "scrapers-proxy2.p.rapidapi.com"
encoded_url = urllib.parse.quote(url, safe='')
api_endpoint = f"https://{api_host}/parser?url={encoded_url}&auto_detect=true"
headers = { "x-rapidapi-host": api_host, "x-rapidapi-key": api_key, "accept-encoding": "gzip" }
try:
async with httpx.AsyncClient(timeout=40.0) as client:
logger.debug(f"[Web Scrape Fallback 3] Sending GET request to {api_host} for {url}")
response = await client.get(api_endpoint, headers=headers)
logger.debug(f"[Web Scrape Fallback 3] Received status {response.status_code} from {api_host} for {url}")
if response.status_code == 200:
try:
data = response.json()
content = data.get("content"); title = data.get("title"); extracted_text = ""
if title and isinstance(title, str): extracted_text += title.strip() + ". "
if content and isinstance(content, str): extracted_text += content.strip()
extracted_text = extracted_text.strip() # Strip final result
if extracted_text and len(extracted_text) > 30:
logger.info(f"[Web Scrape Fallback 3] Success via Scraper's Proxy API for {url}. Len: {len(extracted_text)}")
return extracted_text
else:
logger.warning(f"[Web Scrape Fallback 3] Scraper's Proxy API success but content/title too short/empty for {url}. Keys: {list(data.keys())}. Length: {len(extracted_text)}")
return None
except json.JSONDecodeError: logger.error(f"[Web Scrape Fallback 3] Failed JSON decode Scraper's Proxy API for {url}. Status:{response.status_code}. Resp:{response.text[:500]}"); return None
except Exception as e: logger.error(f"[Web Scrape Fallback 3] Error processing Scraper's Proxy API success response for {url}: {e}", exc_info=True); return None
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 3] Auth error (401) with {api_host}. Check RapidAPI key."); return None
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 3] Forbidden (403) from {api_host}. Check subscription/limits."); return None
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 3] Rate Limit (429) from {api_host}."); return None
elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 3] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
else: logger.error(f"[Web Scrape Fallback 3] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 3] Timeout connecting to {api_host} API for {url}"); return None
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 3] Request error connecting to {api_host} API for {url}: {e}"); return None
except Exception as e: logger.error(f"[Web Scrape Fallback 3] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
# --- Fallback 4: AI Web Scraper via RapidAPI ---
async def get_website_content_via_ai_web_scraper(url: str, api_key: str) -> Optional[str]:
"""Fallback 4: Fetches website content using AI Web Scraper via RapidAPI."""
if not url: logger.error("[Web Scrape Fallback 4] No URL provided"); return None
if not api_key: logger.error("[Web Scrape Fallback 4] RapidAPI key missing."); return None
logger.info(f"[Web Scrape Fallback 4] Attempting fetch for: {url} using AI Web Scraper API")
api_host = "ai-web-scraper.p.rapidapi.com"; api_endpoint = f"https://{api_host}/extract_content/v1"
headers = { 'Content-Type': 'application/x-www-form-urlencoded', 'x-rapidapi-host': api_host, 'x-rapidapi-key': api_key }
payload = {'url': url}
try:
async with httpx.AsyncClient(timeout=45.0) as client:
logger.debug(f"[Web Scrape Fallback 4] Sending POST request to {api_host} for {url}")
response = await client.post(api_endpoint, headers=headers, data=payload)
logger.debug(f"[Web Scrape Fallback 4] Received status {response.status_code} from {api_host} for {url}")
if response.status_code == 200:
try:
data = response.json(); content = None
if isinstance(data, dict): content = data.get("content") or data.get("text") or data.get("extracted_text") or data.get("result")
elif isinstance(data, str): content = data
if content and isinstance(content, str):
content_stripped = content.strip()
if len(content_stripped) > 30:
logger.info(f"[Web Scrape Fallback 4] Success via AI Web Scraper API for {url}. Len: {len(content_stripped)}")
return content_stripped
else:
logger.warning(f"[Web Scrape Fallback 4] AI Web Scraper API success but content too short after stripping for {url}. Len: {len(content_stripped)}")
return None
else:
keys_info = f"Keys: {list(data.keys())}" if isinstance(data, dict) else f"Type: {type(data)}"
logger.warning(f"[Web Scrape Fallback 4] AI Web Scraper API success but content empty/invalid format for {url}. {keys_info}")
return None
except json.JSONDecodeError:
raw_text = response.text.strip()
if raw_text and len(raw_text) > 30:
logger.warning(f"[Web Scrape Fallback 4] Failed JSON decode for AI Web Scraper, but found raw text. Status:{response.status_code}. Using raw text. Len: {len(raw_text)}")
return raw_text
else:
logger.error(f"[Web Scrape Fallback 4] Failed JSON decode AI Web Scraper API for {url}. Status:{response.status_code}. Resp empty/short:{raw_text[:500]}")
return None
except Exception as e: logger.error(f"[Web Scrape Fallback 4] Error processing AI Web Scraper API success response for {url}: {e}", exc_info=True); return None
elif response.status_code == 401: logger.error(f"[Web Scrape Fallback 4] Auth error (401) with {api_host}. Check RapidAPI key."); return None
elif response.status_code == 403: logger.error(f"[Web Scrape Fallback 4] Forbidden (403) from {api_host}. Check subscription/limits."); return None
elif response.status_code == 429: logger.warning(f"[Web Scrape Fallback 4] Rate Limit (429) from {api_host}."); return None
elif response.status_code >= 500: logger.error(f"[Web Scrape Fallback 4] Server error ({response.status_code}) from {api_host}. Resp:{response.text[:200]}"); return None
else: logger.error(f"[Web Scrape Fallback 4] Unexpected status {response.status_code} from {api_host} API for {url}. Resp:{response.text[:200]}"); return None
except httpx.TimeoutException: logger.error(f"[Web Scrape Fallback 4] Timeout connecting to {api_host} API for {url}"); return None
except httpx.RequestError as e: logger.error(f"[Web Scrape Fallback 4] Request error connecting to {api_host} API for {url}: {e}"); return None
except Exception as e: logger.error(f"[Web Scrape Fallback 4] Unexpected error during {api_host} API call for {url}: {e}", exc_info=True); return None
# --- Fallback 5 & 6: Apify Website Scraping ---
async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str, fallback_num: int) -> Optional[str]:
"""Generic function to run an Apify actor and get text content."""
if not url: logger.error(f"[{actor_name} - FB{fallback_num}] No URL provided"); return None
if not api_token: logger.error(f"[{actor_name} - FB{fallback_num}] API token missing."); return None
logger.info(f"[{actor_name} - FB{fallback_num}] Attempting fetch for URL: {url} (Actor: {actor_id})")
sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"; params = {"token": api_token}
# Define different inputs based on actor
run_input: Dict[str, Any]
if actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
# Input for Text Scraper Free
run_input = { "urls": [url] }
logger.debug(f"[{actor_name} - FB{fallback_num}] Using simplified input for Text Scraper: {run_input}")
elif actor_id == APIFY_CRAWLER_ACTOR_ID:
# Input for Website Content Crawler (limit crawl depth)
run_input = {
"startUrls": [{"url": url}],
"maxCrawlPages": 1, # Only crawl the start URL
"maxCrawlDepth": 0, # Do not follow links
"crawlerType": "playwright:firefox", # Or chromium
"maxResults": 1,
# You might need to add parameters to extract specific content if default fails
# e.g., "pageFunction": "async function pageFunction(context) { return { text: document.body.innerText }; }"
}
logger.debug(f"[{actor_name} - FB{fallback_num}] Using input for Website Content Crawler: {run_input}")
else:
logger.error(f"[{actor_name} - FB{fallback_num}] Unknown Apify actor ID: {actor_id}. Cannot determine input format.")
return None
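# Input shapes above: Text Scraper Free only needs {"urls": [url]}, while Website
# Content Crawler takes startUrls plus maxCrawlPages=1 / maxCrawlDepth=0 so that
# only the submitted page is fetched rather than a full site crawl.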
headers = {"Content-Type": "application/json"}
try:
async with httpx.AsyncClient(timeout=180.0) as client: # Increased timeout for Apify actors
logger.debug(f"[{actor_name} - FB{fallback_num}] POST Request to {sync_items_endpoint} for {url}")
response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
logger.debug(f"[{actor_name} - FB{fallback_num}] Received status code {response.status_code} for {url}")
if response.status_code == 200:
try:
results = response.json()
if isinstance(results, list) and len(results) > 0:
item = results[0]; content = None
# Prioritize 'text', then 'content', then 'markdown'
if "text" in item and isinstance(item["text"], str): content = item["text"]
elif "content" in item and isinstance(item["content"], str): content = item["content"]
elif "markdown" in item and isinstance(item["markdown"], str): content = item["markdown"]
# Fallback: Parse 'html' if other fields are missing
elif "html" in item and isinstance(item["html"], str):
logger.warning(f"[{actor_name} - FB{fallback_num}] No 'text', 'content', or 'markdown' found, parsing 'html'.")
try:
soup = BeautifulSoup(item["html"], DEFAULT_PARSER)
content = " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
except Exception as bs_err:
logger.error(f"[{actor_name} - FB{fallback_num}] Error parsing Apify HTML with BS4: {bs_err}")
content = None # Ensure content is None if parsing fails
if content and isinstance(content, str):
content_stripped = content.strip()
if len(content_stripped) > 50: # Increased length check
logger.info(f"[{actor_name} - FB{fallback_num}] Success via REST for {url}. Length: {len(content_stripped)}")
return content_stripped
else:
logger.warning(f"[{actor_name} - FB{fallback_num}] Dataset item parsed but text content too short after stripping for {url}. Length: {len(content_stripped)}")
return None
else:
logger.warning(f"[{actor_name} - FB{fallback_num}] Dataset item parsed but text content empty or invalid format for {url}. Item keys: {list(item.keys())}")
return None
else: logger.warning(f"[{actor_name} - FB{fallback_num}] Actor success but dataset was empty for {url}. Response: {results}"); return None
except json.JSONDecodeError: logger.error(f"[{actor_name} - FB{fallback_num}] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}"); return None
except Exception as e: logger.error(f"[{actor_name} - FB{fallback_num}] Error processing success response for {url}: {e}", exc_info=True); return None
elif response.status_code == 400: logger.error(f"[{actor_name} - FB{fallback_num}] Bad Request (400) for {url}. Check run_input. Resp:{response.text[:200]}"); return None
elif response.status_code == 401: logger.error(f"[{actor_name} - FB{fallback_num}] Auth error (401). Check token."); return None
elif response.status_code == 404: logger.error(f"[{actor_name} - FB{fallback_num}] Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
else: logger.error(f"[{actor_name} - FB{fallback_num}] Unexpected status {response.status_code} for {url}. Resp:{response.text[:200]}"); return None
except httpx.TimeoutException as e: logger.error(f"[{actor_name} - FB{fallback_num}] Timeout during API interaction for {url}: {e}"); return None
except httpx.HTTPStatusError as e: logger.error(f"[{actor_name} - FB{fallback_num}] HTTP Status Error during API interaction for {url}: {e}"); return None
except httpx.RequestError as e: logger.error(f"[{actor_name} - FB{fallback_num}] Request error during API interaction for {url}: {e}"); return None
except Exception as e: logger.error(f"[{actor_name} - FB{fallback_num}] Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
"""Fallback 5: Fetches website content using Apify Website Content Crawler."""
return await _run_apify_actor_for_web_content(
url=url, api_token=api_token, actor_id=APIFY_CRAWLER_ACTOR_ID,
actor_name="Apify Crawler", fallback_num=5
)
async def get_website_content_via_apify_text_scraper(url: str, api_token: str) -> Optional[str]:
"""Fallback 6: Fetches website content using Apify Text Scraper Free."""
return await _run_apify_actor_for_web_content(
url=url, api_token=api_token, actor_id=APIFY_TEXT_SCRAPER_ACTOR_ID,
actor_name="Apify Text Scraper", fallback_num=6
)
# --- Summarization Functions (Unchanged) ---
async def _call_gemini(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
"""Internal function to call Gemini API. Returns (summary, error_message)."""
global GEMINI_MODEL, _gemini_primary_enabled
if not _gemini_primary_enabled:
logger.error("[Gemini Primary] Called but is disabled.");
return None, "Error: Primary AI service (Gemini) not configured/available."
logger.info(f"[Gemini Primary] Generating {summary_type} summary using {GEMINI_MODEL}. Input length: {len(text)}")
if summary_type == "paragraph":
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n"
"• Clear and simple language suitable for someone unfamiliar with the topic.\n"
"• Uses British English spellings throughout.\n"
"• Straightforward and understandable vocabulary; avoid complex terms.\n"
"• Presented as ONE SINGLE PARAGRAPH.\n"
"• No more than 85 words maximum; but does not have to be exactly 85.\n"
"• Considers the entire text content equally.\n"
"• Uses semicolons (;) instead of em dashes (— or –).\n"
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
"Here is the text to summarise:")
else: # points summary
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this format:\n\n"
"• For each distinct topic or section identified in the text, create a heading.\n"
"• Each heading MUST be plain text without any formatting (e.g., Section Title).\n"
"• Immediately following each heading, list the key points as a bulleted list.\n"
"• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n"
"• The text within each bullet point should NOT contain any bold formatting.\n"
"• IMPORTANT: Never use bold formatting (double asterisks) within the text of the bullet points themselves.\n"
"• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n"
"• Use British English spellings throughout.\n"
"• Avoid overly complex or advanced vocabulary.\n"
"• Keep bullet points concise.\n"
"• Ensure the entire summary takes no more than two minutes to read.\n"
"• Consider the entire text's content, not just the beginning or a few topics.\n"
"• Use semicolons (;) instead of em dashes (— or –).\n"
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
"Here is the text to summarise:")
# Gemini 1.5 Flash context window is large, but let's keep a reasonable practical limit
MAX_INPUT_LENGTH_GEMINI = 900000 # ~900k characters (roughly 200-250k tokens), comfortably within Gemini 1.5 Flash's ~1M-token context window
if len(text) > MAX_INPUT_LENGTH_GEMINI:
logger.warning(f"[Gemini Primary] Input length ({len(text)}) exceeds limit ({MAX_INPUT_LENGTH_GEMINI}). Truncating.");
text = text[:MAX_INPUT_LENGTH_GEMINI] + "... (Content truncated)"
full_prompt = f"{prompt}\n\n{text}"
safety_settings = { HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, }
# Check if HARM_CATEGORY_CIVIC_INTEGRITY exists before adding (might vary by SDK version/region)
# if hasattr(HarmCategory, 'HARM_CATEGORY_CIVIC_INTEGRITY'):
# safety_settings[HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY] = HarmBlockThreshold.BLOCK_NONE
logger.debug(f"[Gemini Primary] Using safety settings: { {k.name: v.name for k, v in safety_settings.items()} }")
try:
logger.debug(f"[Gemini Primary] Initializing model {GEMINI_MODEL}")
model = genai.GenerativeModel(GEMINI_MODEL)
logger.info(f"[Gemini Primary] Sending request to Gemini ({GEMINI_MODEL})...")
request_options = {"timeout": 120} # 120 seconds timeout
response = await model.generate_content_async(
full_prompt,
generation_config=genai.types.GenerationConfig(), # Use default generation config
safety_settings=safety_settings,
request_options=request_options
)
logger.info("[Gemini Primary] Received response from Gemini.")
# Check for blocking based on prompt feedback first
if response.prompt_feedback and response.prompt_feedback.block_reason:
block_reason_str = getattr(response.prompt_feedback.block_reason, 'name', str(response.prompt_feedback.block_reason))
logger.warning(f"[Gemini Primary] Request blocked by API based on prompt feedback. Reason: {block_reason_str}");
return None, f"Sorry, the primary AI model ({GEMINI_MODEL}) blocked the request (Reason: {block_reason_str})."
# If not blocked by prompt, check candidate content and finish reason
summary = None
finish_reason_str = 'UNKNOWN'
safety_block_reason = None
if response.candidates:
candidate = response.candidates[0]
finish_reason_enum = getattr(candidate, 'finish_reason', None)
finish_reason_str = getattr(finish_reason_enum, 'name', 'N/A') if finish_reason_enum else 'N/A'
if finish_reason_str == 'SAFETY':
safety_ratings_str = "N/A"
if hasattr(candidate, 'safety_ratings'):
safety_ratings_str = ", ".join([f"{rating.category.name}: {rating.probability.name}" for rating in candidate.safety_ratings])
safety_block_reason = f"SAFETY (Ratings: [{safety_ratings_str}])"
logger.warning(f"[Gemini Primary] Candidate blocked due to SAFETY. Finish Reason: {finish_reason_str}. {safety_block_reason}")
# Don't return yet, check if response.text fallback works
elif finish_reason_str not in ['STOP', 'MAX_TOKENS', 'N/A', None]: # Log unusual reasons
logger.warning(f"[Gemini Primary] Candidate finished with non-standard reason: {finish_reason_str}")
# Try extracting text from the candidate parts
if candidate.content and candidate.content.parts:
summary = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
# Fallback to response.text if candidate parsing failed or was blocked (but prompt wasn't)
if summary is None:
try:
# This might raise ValueError if the response was fully blocked (e.g., safety)
summary = response.text
if safety_block_reason: # If we got here despite a safety block, log it
logger.warning(f"[Gemini Primary] Got text via response.text despite SAFETY block reason: {safety_block_reason}")
except ValueError as e:
logger.warning(f"[Gemini Primary] Error accessing response.text (likely blocked response): {e}. Final Finish Reason: {finish_reason_str}")
summary = None # Ensure summary is None if .text fails
# Final check and return
if summary:
logger.info(f"[Gemini Primary] Success generating summary. Finish Reason: {finish_reason_str}. Output len: {len(summary)}");
return summary.strip(), None
else:
# Provide a more specific error if safety was the likely cause
error_msg = f"Sorry, the primary AI model ({GEMINI_MODEL}) did not provide a summary (Finish Reason: {finish_reason_str})."
if safety_block_reason:
error_msg = f"Sorry, the primary AI model ({GEMINI_MODEL}) blocked the response due to safety filters ({finish_reason_str})."
logger.warning(f"[Gemini Primary] Gemini returned empty summary or content was blocked. Final Finish Reason: {finish_reason_str}. Safety Block Reason: {safety_block_reason}");
return None, error_msg
except AttributeError as ae:
# This might happen if the SDK response structure changes
logger.error(f"[Gemini Primary] AttributeError during Gemini response processing: {ae}. SDK might be incompatible or response structure unexpected.", exc_info=True)
return None, f"Sorry, error processing response from the primary AI ({GEMINI_MODEL})."
except Exception as e:
# Catch potential network errors, timeouts, etc.
logger.error(f"[Gemini Primary] Unexpected error during Gemini API call: {e}", exc_info=True)
return None, f"Sorry, unexpected error using primary AI ({GEMINI_MODEL})."
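# Illustrative usage: `summary, err = await _call_gemini(article_text, "paragraph")`;
# exactly one of the two return values is None (the summary on success, an error message on failure).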
async def _call_openrouter(text: str, summary_type: str) -> Tuple[Optional[str], Optional[str]]:
"""Internal function to call OpenRouter API (Fallback). Returns (summary, error_message)."""
global OPENROUTER_API_KEY, OPENROUTER_MODEL, _openrouter_fallback_enabled
if not _openrouter_fallback_enabled:
logger.error("[OpenRouter Fallback] Called but is disabled.");
return None, "Error: Fallback AI service (OpenRouter) not configured/available."
logger.info(f"[OpenRouter Fallback] Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
if summary_type == "paragraph":
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST be:\n"
"• Clear and simple language suitable for someone unfamiliar with the topic.\n"
"• Uses British English spellings throughout.\n"
"• Straightforward and understandable vocabulary; avoid complex terms.\n"
"• Presented as ONE SINGLE PARAGRAPH.\n"
"• No more than 85 words maximum; but does not have to be exactly 85.\n"
"• Considers the entire text content equally.\n"
"• Uses semicolons (;) instead of em dashes (— or –).\n"
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
"Here is the text to summarise:")
else: # points summary
prompt = ("You are an AI model designed to provide concise summaries using British English spellings. Your output MUST strictly follow this format:\n\n"
"• For each distinct topic or section identified in the text, create a heading.\n"
"• Each heading MUST be plain text without any formatting (e.g., Section Title).\n"
"• Immediately following each heading, list the key points as a bulleted list.\n"
"• Each bullet point MUST start with a hyphen and a space (- ) on a new line.\n"
"• The text within each bullet point should NOT contain any bold formatting.\n"
"• IMPORTANT: Never use bold formatting (double asterisks) within the text of the bullet points themselves.\n"
"• Use clear, simple, and straightforward language suitable for someone unfamiliar with the topic.\n"
"• Use British English spellings throughout.\n"
"• Avoid overly complex or advanced vocabulary.\n"
"• Keep bullet points concise.\n"
"• Ensure the entire summary takes no more than two minutes to read.\n"
"• Consider the entire text's content, not just the beginning or a few topics.\n"
"• Use semicolons (;) instead of em dashes (— or –).\n"
"• **Focus ONLY on the main content; strictly EXCLUDE information about website features, subscriptions, ads, cookie notices, or navigation elements. Do not include things like free/paid tiers; basic/premium memberships. Especially for ACS membership.**\n\n"
"Here is the text to summarise:")
# Check model context window if known, otherwise use a generous limit
# Deepseek Coder 33B has 16k context, let's aim lower for safety
MAX_INPUT_LENGTH_OR = 60000 # Roughly 15k tokens
if len(text) > MAX_INPUT_LENGTH_OR:
logger.warning(f"[OpenRouter Fallback] Input length ({len(text)}) exceeds estimated limit ({MAX_INPUT_LENGTH_OR}) for {OPENROUTER_MODEL}. Truncating.");
text = text[:MAX_INPUT_LENGTH_OR] + "... (Content truncated)"
full_prompt = f"{prompt}\n\n{text}"
headers = {
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
"Content-Type": "application/json",
# Optional, but good practice for OpenRouter identification
"HTTP-Referer": "https://github.com/your-repo-or-app-name", # Replace with your repo/app URL
"X-Title": "TelegramSummariserBot" # Replace with your app name
}
payload = {
"model": OPENROUTER_MODEL,
"messages": [{"role": "user", "content": full_prompt}]
# Add optional parameters like temperature, max_tokens if needed
# "temperature": 0.7,
# "max_tokens": 1024,
}
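# For reference, a successful chat-completions response is expected to look roughly like
# (illustrative, abridged):
#   {"choices": [{"message": {"role": "assistant", "content": "..."}, "finish_reason": "stop"}]}
# which is the structure the response handling below assumes.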
openrouter_api_endpoint = "https://openrouter.ai/api/v1/chat/completions"
# Increased read timeout as some models can take time
api_timeouts = httpx.Timeout(connect=10.0, read=90.0, write=10.0, pool=60.0);
response = None
try:
async with httpx.AsyncClient(timeout=api_timeouts) as client:
logger.info(f"[OpenRouter Fallback] Sending request to OpenRouter ({OPENROUTER_MODEL})...")
response = await client.post(openrouter_api_endpoint, headers=headers, json=payload)
logger.info(f"[OpenRouter Fallback] Received response. Status: {response.status_code}")
if response.status_code == 200:
try:
data = response.json()
if data.get("choices") and isinstance(data["choices"], list) and len(data["choices"]) > 0:
choice = data["choices"][0]
message = choice.get("message")
finish_reason = choice.get("finish_reason", "N/A")
if message and isinstance(message, dict):
summary = message.get("content")
if summary:
logger.info(f"[OpenRouter Fallback] Success. Finish: {finish_reason}. Output len: {len(summary)}")
return summary.strip(), None
else:
# Model might return empty content successfully
logger.warning(f"[OpenRouter Fallback] Success but content empty. Finish: {finish_reason}. Resp: {data}")
return None, f"Fallback AI ({OPENROUTER_MODEL}) returned empty summary (Finish: {finish_reason})."
else:
logger.error(f"[OpenRouter Fallback] Unexpected message structure: {message}. Finish: {finish_reason}. Full: {data}")
return None, "Could not parse fallback AI response (message format)."
else:
# Check for specific OpenRouter errors in the response body
error_details = data.get("error", {})
error_msg = error_details.get("message", "Unknown error in response structure")
logger.error(f"[OpenRouter Fallback] Unexpected choices structure or error in response. Error: {error_msg}. Full: {data}")
return None, f"Fallback AI response error: {error_msg}."
except json.JSONDecodeError:
logger.error(f"[OpenRouter Fallback] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:500]}")
return None, "Failed to understand fallback AI response."
except Exception as e:
logger.error(f"[OpenRouter Fallback] Error processing success response: {e}", exc_info=True)
return None, "Error processing fallback AI response."
# Handle specific HTTP error codes
elif response.status_code == 401:
logger.error("[OpenRouter Fallback] API key invalid or missing (401).")
return None, "Fallback AI authentication failed (check key)."
elif response.status_code == 402:
logger.error("[OpenRouter Fallback] Payment Required/Quota Exceeded (402).")
return None, f"Fallback AI ({OPENROUTER_MODEL}) quota/limit reached."
elif response.status_code == 429:
logger.warning(f"[OpenRouter Fallback] Rate Limit Exceeded (429) for {OPENROUTER_MODEL}.")
return None, f"Fallback AI ({OPENROUTER_MODEL}) is rate-limited. Try again later."
elif response.status_code == 500:
logger.error(f"[OpenRouter Fallback] OpenRouter Internal Server Error (500). Resp:{response.text[:500]}")
return None, f"Fallback AI service ({OPENROUTER_MODEL}) encountered an internal error."
else:
# General unexpected status code
error_info = ""
try: # Try to get error message from JSON response
error_info = response.json().get("error", {}).get("message", "")
except Exception: pass
logger.error(f"[OpenRouter Fallback] Unexpected status {response.status_code}. Error: '{error_info}' Resp:{response.text[:500]}");
return None, f"Fallback AI ({OPENROUTER_MODEL}) returned error status {response.status_code}."
except httpx.TimeoutException as e:
logger.error(f"[OpenRouter Fallback] Timeout error ({type(e)}) connecting to or reading from OpenRouter API: {e}")
return None, f"Fallback AI ({OPENROUTER_MODEL}) timed out."
except httpx.RequestError as e:
logger.error(f"[OpenRouter Fallback] Request error connecting to OpenRouter API: {e}")
return None, "Error connecting to fallback AI service."
except Exception as e:
logger.error(f"[OpenRouter Fallback] Unexpected error during OpenRouter call: {e}", exc_info=True)
return None, "Unexpected error using fallback AI service."
async def generate_summary(text: str, summary_type: str) -> str:
"""Generates summary using Gemini (Primary) and falls back to OpenRouter if needed."""
global _gemini_primary_enabled, _openrouter_fallback_enabled, GEMINI_MODEL, OPENROUTER_MODEL
logger.info(f"[Summary Generation] Starting process. Primary: Gemini ({GEMINI_MODEL}), Fallback: OpenRouter ({OPENROUTER_MODEL})")
final_summary: Optional[str] = None; primary_error_message: Optional[str] = None
if _gemini_primary_enabled:
logger.info(f"[Summary Generation] Attempting primary AI: Gemini ({GEMINI_MODEL})")
final_summary, primary_error_message = await _call_gemini(text, summary_type)
if final_summary: logger.info("[Summary Generation] Success with primary AI (Gemini)."); return final_summary
else: logger.warning(f"[Summary Generation] Primary AI (Gemini) failed. Error: {primary_error_message}. Proceeding to fallback.")
else: logger.warning("[Summary Generation] Primary AI (Gemini) disabled. Proceeding to fallback."); primary_error_message = "Primary AI (Gemini) unavailable."
if _openrouter_fallback_enabled:
logger.info(f"[Summary Generation] Attempting fallback AI: OpenRouter ({OPENROUTER_MODEL})")
fallback_summary, fallback_error_message = await _call_openrouter(text, summary_type)
if fallback_summary: logger.info("[Summary Generation] Success with fallback AI (OpenRouter)."); return fallback_summary
else:
logger.error(f"[Summary Generation] Fallback AI (OpenRouter) also failed. Error: {fallback_error_message}")
# Construct a combined error message
primary_err = primary_error_message or "Primary AI unavailable"
fallback_err = fallback_error_message or "Fallback AI failed with unknown error"
return f"Sorry, summarization failed.\nPrimary: {primary_err}\nFallback ({OPENROUTER_MODEL}): {fallback_err}"
else:
logger.error("[Summary Generation] Fallback AI (OpenRouter) disabled. Cannot proceed.")
if primary_error_message: return f"{primary_error_message} Fallback AI is also unavailable."
else: return "Error: Both primary and fallback AI services are unavailable."
# This line should technically not be reached if logic is sound
logger.error("[Summary Generation] Reached end of function unexpectedly.")
return "Sorry, unknown error during summary generation."
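# Callers treat any return value starting with "Error:" or "Sorry," as a failure message
# rather than a summary (see the check in process_summary_task below).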
# --- Main Processing Logic (MODIFIED with Crawl4AI and re-ordered fallbacks) ---
async def process_summary_task( user_id: int, chat_id: int, message_id_to_edit: Optional[int], url: str, summary_type: str, bot_token: str ) -> None:
"""Handles the entire process: fetching content (Crawl4AI -> Fallbacks) and summarizing."""
task_id = f"{user_id}-{message_id_to_edit or 'new'}"; logger.info(f"[Task {task_id}] Starting processing for URL: {url}")
background_request: Optional[BaseRequest] = None; bot: Optional[Bot] = None
try:
# Use longer timeouts for the background bot to handle potentially long scrapes/summaries
background_request = HTTPXRequest( connect_timeout=15.0, read_timeout=240.0, write_timeout=60.0, pool_timeout=240.0 )
bot = Bot(token=bot_token, request=background_request)
except Exception as e:
logger.critical(f"[Task {task_id}] Failed to create background bot: {e}", exc_info=True)
# We cannot proceed without a bot instance
return
content: Optional[str] = None
user_feedback_message: Optional[str] = None
success: bool = False
# Use the original button message ID if available, otherwise we'll send a new one
status_message_id: Optional[int] = message_id_to_edit
# Keep track if we sent a *new* message that needs deleting (vs editing the button message)
new_status_message_id : Optional[int] = None
try:
# --- 1. Initial User Feedback ---
processing_message_text = f"Got it! Generating '{summary_type}' summary for:\n`{url}`\n\nFetching content (using primary method... this might take a minute)..."
if status_message_id:
try:
# Edit the message containing the buttons
await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=status_message_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN, reply_markup=None )
logger.debug(f"[Task {task_id}] Edited button message {status_message_id} to 'Processing'")
except (BadRequest, TelegramError) as e:
# Common errors: message not modified, message to edit not found, query too old
logger.warning(f"[Task {task_id}] Could not edit original button message {status_message_id}: {e}. Will send a new status message.")
status_message_id = None # Ensure we send a new message if edit fails
except Exception as e:
logger.error(f"[Task {task_id}] Unexpected error editing button message {status_message_id}: {e}. Will send new.", exc_info=True)
status_message_id = None
# If we couldn't edit the original message, send a new one
if not status_message_id:
try:
status_message = await retry_bot_operation( bot.send_message, chat_id=chat_id, text=processing_message_text, parse_mode=ParseMode.MARKDOWN )
if status_message:
new_status_message_id = status_message.message_id
logger.debug(f"[Task {task_id}] Sent new status message {new_status_message_id}")
else:
# This should ideally not happen due to retry_bot_operation, but handle defensively
raise RuntimeError("Failed to send new status message after retries.")
except Exception as e:
# If we can't even send a status message, we can't proceed meaningfully
logger.error(f"[Task {task_id}] CRITICAL: Failed to send initial status message: {e}. Aborting task.", exc_info=True)
# Attempt to clean up the original button message if it exists
if message_id_to_edit:
try: await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=message_id_to_edit)
except Exception: pass
raise # Re-raise to be caught by outer try/finally
# Determine which message ID to update/delete later
message_to_update_id = new_status_message_id or status_message_id
try:
# --- 2. Content Fetching (Chain of methods) ---
# Send typing indicator
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
is_youtube = is_youtube_url(url); logger.debug(f"[Task {task_id}] URL type: {'YouTube' if is_youtube else 'Website'}")
if is_youtube:
# --- YouTube Transcript Logic (Unchanged from original) ---
video_id = extract_youtube_id(url)
if video_id:
content = await get_youtube_transcript(video_id, url)
else:
user_feedback_message = "Sorry, I couldn't understand that YouTube URL format."
# Set feedback message if transcript fetch failed
if not content and not user_feedback_message:
user_feedback_message = "Sorry, I couldn't get the transcript for that YouTube video using any available method (unavailable/private/no captions?)."
else:
# --- Website Scraping Logic (NEW Order: Crawl4AI -> Fallbacks) ---
global URLTOTEXT_API_KEY, RAPIDAPI_KEY, APIFY_API_TOKEN
global _urltotext_key_exists, _rapidapi_key_exists, _apify_token_exists, _crawl4ai_primary_scrape_enabled
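# Website fetch order (first method that returns usable content wins):
#   0) Crawl4AI (primary)             1) Direct fetch + BeautifulSoup    2) urltotext.com API
#   3) Scraper's Proxy (RapidAPI)     4) AI Web Scraper (RapidAPI)
#   5) Apify Website Content Crawler  6) Apify Text Scraper Free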
# Method 0: Primary Scrape (Crawl4AI)
logger.info(f"[Task {task_id}] Trying Web Scrape Method 0 (Primary: Crawl4AI)...")
if _crawl4ai_primary_scrape_enabled:
content = await get_website_content_via_crawl4ai(url)
if content:
logger.info(f"[Task {task_id}] Method 0 (Crawl4AI) succeeded.")
else:
logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) failed or returned insufficient content.")
# Edit status message to indicate fallback attempt
if message_to_update_id:
try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=message_to_update_id, text="Primary scrape method failed, trying fallbacks...", parse_mode=ParseMode.MARKDOWN)
except Exception: pass # Ignore if edit fails
else:
logger.warning(f"[Task {task_id}] Method 0 (Crawl4AI) skipped - library/driver unavailable.")
# Edit status message
if message_to_update_id:
try: await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=message_to_update_id, text="Primary scrape method unavailable, trying fallbacks...", parse_mode=ParseMode.MARKDOWN)
except Exception: pass
# Method 1: Fallback 1 (Direct Fetch + BS4)
if not content:
logger.warning(f"[Task {task_id}] Method 0 failed/skipped. Trying Method 1 (Direct Fetch + BS4)...")
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
content = await get_website_content_direct_bs4(url)
if not content: logger.warning(f"[Task {task_id}] Method 1 (Direct Fetch + BS4) failed.")
# Method 2: Fallback 2 (urltotext.com)
if not content:
logger.warning(f"[Task {task_id}] Method 1 failed. Trying Method 2 (urltotext.com)...")
if _urltotext_key_exists:
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
content = await get_website_content_via_api(url, URLTOTEXT_API_KEY)
if not content: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) failed.")
else: logger.warning(f"[Task {task_id}] Method 2 (urltotext.com) API key unavailable. Skipping.")
# Method 3: Fallback 3 (Scraper's Proxy via RapidAPI)
if not content:
logger.warning(f"[Task {task_id}] Method 2 failed. Trying Method 3 (Scraper's Proxy)...")
if _rapidapi_key_exists:
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
content = await get_website_content_via_scrapers_proxy(url, RAPIDAPI_KEY)
if not content: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) failed.")
else: logger.warning(f"[Task {task_id}] Method 3 (Scraper's Proxy) RapidAPI key unavailable. Skipping.")
# Method 4: Fallback 4 (AI Web Scraper via RapidAPI)
if not content:
logger.warning(f"[Task {task_id}] Method 3 failed. Trying Method 4 (AI Web Scraper)...")
if _rapidapi_key_exists:
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
content = await get_website_content_via_ai_web_scraper(url, RAPIDAPI_KEY)
if not content: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) failed.")
else: logger.warning(f"[Task {task_id}] Method 4 (AI Web Scraper) RapidAPI key unavailable. Skipping.")
# Method 5: Fallback 5 (Apify Website Content Crawler)
if not content:
logger.warning(f"[Task {task_id}] Method 4 failed. Trying Method 5 (Apify Crawler)...")
if _apify_token_exists:
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
content = await get_website_content_via_apify_crawler(url, APIFY_API_TOKEN)
if not content: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) failed.")
else: logger.warning(f"[Task {task_id}] Method 5 (Apify Crawler) APIFY_API_TOKEN unavailable. Skipping.")
# Method 6: Fallback 6 (Apify Text Scraper Free)
if not content:
logger.warning(f"[Task {task_id}] Method 5 failed. Trying Method 6 (Apify Text Scraper)...")
if _apify_token_exists:
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
content = await get_website_content_via_apify_text_scraper(url, APIFY_API_TOKEN)
if not content: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) failed.")
else: logger.warning(f"[Task {task_id}] Method 6 (Apify Text Scraper) APIFY_API_TOKEN unavailable. Skipping.")
# Final check if all website methods failed
if not content and not user_feedback_message:
logger.error(f"[Task {task_id}] All web scraping methods failed for {url}.")
user_feedback_message = "Sorry, I couldn't fetch readable content from that website using multiple methods (blocked/dynamic content/empty?). Even the advanced crawler failed."
# --- 3. Summarization ---
if content:
logger.info(f"[Task {task_id}] Content fetched successfully (len:{len(content)}). Generating '{summary_type}' summary.")
# Update status message before starting potentially long summary generation
if message_to_update_id:
try:
await retry_bot_operation( bot.edit_message_text, chat_id=chat_id, message_id=message_to_update_id, text=f"Content fetched! Now generating '{summary_type}' summary with AI...", parse_mode=ParseMode.MARKDOWN, reply_markup=None )
except Exception as edit_e:
logger.warning(f"[Task {task_id}] Failed to edit status message before summary generation: {edit_e}")
# Send typing indicator again for summary generation
await retry_bot_operation(bot.send_chat_action, chat_id=chat_id, action='typing')
final_summary = await generate_summary(content, summary_type)
# Check if summary generation itself returned an error message
if final_summary.startswith("Error:") or final_summary.startswith("Sorry,"):
user_feedback_message = final_summary # Use the error message from generate_summary
logger.warning(f"[Task {task_id}] Summary generation failed: {final_summary}")
else:
# Summary successful, send it (potentially in parts)
max_length = 4096 # Telegram message length limit
if len(final_summary) <= max_length:
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=final_summary, parse_mode=None, link_preview_options={'is_disabled': True} )
else:
# Split into parts
summary_parts = []
current_part = ""
for line in final_summary.splitlines(keepends=True):
if len(current_part) + len(line) > max_length:
summary_parts.append(current_part)
current_part = line
else:
current_part += line
if current_part: # Add the last part
summary_parts.append(current_part)
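# Note: this simple line-based split assumes no single line exceeds max_length;
# an individual line longer than 4096 characters would still fail to send.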
logger.info(f"[Task {task_id}] Summary too long ({len(final_summary)} chars), splitting into {len(summary_parts)} parts.")
for i, part in enumerate(summary_parts):
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=part, parse_mode=None, link_preview_options={'is_disabled': True} )
if i < len(summary_parts) - 1:
await asyncio.sleep(0.7) # Short delay between parts
success = True
logger.info(f"[Task {task_id}] Successfully sent summary.")
user_feedback_message = None # Clear any previous potential error message
# --- 4. Handle Final Failure Feedback ---
# If we have a user_feedback_message set at this point, it means something failed
# (either content fetching or summarization)
if user_feedback_message:
logger.warning(f"[Task {task_id}] Process failed. Sending failure feedback: {user_feedback_message}")
# Send the failure message as a new message
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message, link_preview_options={'is_disabled': True} )
except Exception as e:
# Catch any unexpected errors during the main processing block
logger.error(f"[Task {task_id}] Unexpected error during core processing: {e}", exc_info=True)
user_feedback_message = "Oops! Something went wrong while processing your request. Please try again later."
try:
# Try to send a generic error message
await retry_bot_operation( bot.send_message, chat_id=chat_id, text=user_feedback_message )
except Exception as feedback_err:
logger.error(f"[Task {task_id}] Failed even to send the generic error feedback message: {feedback_err}")
success = False # Ensure success is false
except Exception as outer_e:
# Catch critical errors (like failure to send initial status message)
logger.critical(f"[Task {task_id}] Critical outer error prevented task execution: {outer_e}", exc_info=True)
try:
if bot: # Check if bot was initialized
await retry_bot_operation( bot.send_message, chat_id=chat_id, text="❌ A critical internal error occurred. I couldn't process your request." )
except Exception as crit_feedback_err:
logger.exception(f"[Task {task_id}] Failed even to send the critical error message: {crit_feedback_err}")
success = False # Ensure success is false
finally:
# --- 5. Cleanup ---
# Delete the status message we were updating (either the original button message or the new one we sent)
delete_target_id = new_status_message_id if new_status_message_id else status_message_id
if delete_target_id and bot:
try:
await retry_bot_operation(bot.delete_message, chat_id=chat_id, message_id=delete_target_id)
logger.debug(f"[Task {task_id}] Deleted status/button message {delete_target_id}")
except (BadRequest, TelegramError) as del_e:
# Ignore errors like "message to delete not found"
if "not found" not in str(del_e).lower():
logger.warning(f"[Task {task_id}] Failed to delete status/button message {delete_target_id}: {del_e}")
except Exception as del_e:
logger.warning(f"[Task {task_id}] Unexpected error deleting status/button message {delete_target_id}: {del_e}")
# Close the background bot's HTTPX client if it was created
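# (On python-telegram-bot v20+, awaiting `background_request.shutdown()` may be a cleaner
# alternative to touching the private `_client` attribute directly.)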
if background_request and hasattr(background_request, '_client') and background_request._client:
try:
await background_request._client.aclose()
logger.debug(f"[Task {task_id}] Background bot's HTTPX client closed.")
except Exception as close_e:
logger.warning(f"[Task {task_id}] Error closing background bot's client: {close_e}")
logger.info(f"[Task {task_id}] Task finished. Overall Success: {success}")
# --- Telegram Handlers ---
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
user = update.effective_user
if not user or not update.message: return
mention = user.mention_html()
logger.info(f"User {user.id} ({user.username or 'no_username'}) used /start.")
await update.message.reply_html( f"👋 Hello {mention}! I can summarise YouTube links or website URLs.\n\nJust send me a link anytime!" )
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
user = update.effective_user
if not user or not update.message: return
logger.info(f"User {user.id} ({user.username or 'no_username'}) used /help.")
help_text = ( "📖 **How to use this bot:**\n\n"
"1. Send me any YouTube video link or website URL.\n"
"2. I'll ask how you want it summarised (paragraph or points).\n"
"3. Click the button for your choice.\n"
"4. Wait while I fetch the content and generate the summary!\n\n"
"⚙️ **Website Scraping:** I use an advanced web crawler (`crawl4ai`) first. If that doesn't work, I'll try several fallback methods (direct fetch, APIs) to get the text.\n"
"📺 **YouTube:** I try the official library first, then fall back to APIs if needed.\n"
"🤖 **Summaries:** I use Google Gemini primarily, with OpenRouter as a backup.\n\n"
"**Commands:**\n"
"`/start` - Display the welcome message\n"
"`/help` - Show this help message" )
await update.message.reply_text(help_text, parse_mode=ParseMode.MARKDOWN)
async def handle_potential_url(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
if not update.message or not update.message.text: return
message_text = update.message.text.strip(); user = update.effective_user
if not user: return
# More robust URL extraction using regex - finds the first http(s) link
url_pattern = re.compile(r"https?://[^\s/$.?#].[^\s]*", re.IGNORECASE)
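# Illustrative match: "check this out https://example.com/article." captures
# "https://example.com/article."; the trailing punctuation is stripped just below.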
match = url_pattern.search(message_text)
if match:
extracted_url = match.group(0)
# Clean potential trailing characters like periods or parentheses if message contained more text
extracted_url = extracted_url.rstrip(').,')
logger.info(f"User {user.id} ({user.username or 'no_username'}) sent potential URL: {extracted_url}")
# Store URL and original message ID in user_data for the callback
context.user_data['url_to_summarize'] = extracted_url
context.user_data['original_message_id'] = update.message.message_id # Store original message ID if needed later
keyboard = [[ InlineKeyboardButton("Paragraph Summary", callback_data="paragraph"), InlineKeyboardButton("Points Summary", callback_data="points") ]]
reply_markup = InlineKeyboardMarkup(keyboard)
try:
# Reply to the original message
await update.message.reply_text(
f"Okay, I see this link:\n`{extracted_url}`\n\nHow would you like it summarised?",
reply_markup=reply_markup,
disable_web_page_preview=True,
parse_mode=ParseMode.MARKDOWN
)
except BadRequest as e:
if "chat not found" in str(e).lower() or "bot was blocked by the user" in str(e).lower():
logger.warning(f"Could not reply to user {user.id} (chat not found or blocked).")
else:
logger.error(f"BadRequest replying to URL message from {user.id}: {e}")
except Exception as e:
logger.error(f"Error replying to URL message from {user.id}: {e}", exc_info=True)
else:
# If the message filter passed but regex didn't find a URL, log it but don't reply
logger.debug(f"Ignoring message from {user.id} - Entity filter matched but no URL found by regex: {message_text[:100]}")
async def handle_summary_type_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
query = update.callback_query
if not query or not query.message or not query.from_user:
logger.warning("Callback query received without essential data.")
# Attempt to answer the query even if we can't process it, to remove the loading indicator
if query:
try: await query.answer("Error: Missing data.", show_alert=True)
except Exception: pass
return
user = query.from_user
summary_type = query.data
query_id = query.id
chat_id = query.message.chat_id
message_id_to_edit = query.message.message_id # This is the message with the buttons
try:
# Acknowledge the button press quickly
await query.answer()
logger.debug(f"Acknowledged callback {query_id} from {user.id} for summary type '{summary_type}'")
except BadRequest as e:
if "query is too old" in str(e).lower():
logger.warning(f"Callback query {query_id} is too old to answer. User might have double-clicked or waited too long.")
# Optionally edit the message to indicate the issue if possible
try: await query.edit_message_text(text="This request is too old. Please send the link again.", reply_markup=None)
except Exception: pass
return # Stop processing if the query is too old
else:
# Log other BadRequest errors but attempt to continue if acknowledging failed
logger.error(f"Error answering callback {query_id}: {e}", exc_info=True)
except Exception as e:
logger.error(f"Unexpected error answering callback {query_id}: {e}", exc_info=True)
# Attempt to continue processing even if answering failed
# Retrieve the URL stored in user_data
url = context.user_data.get('url_to_summarize')
logger.info(f"User {user.id} chose '{summary_type}' for button message {message_id_to_edit}. URL in context: {'Yes' if url else 'No'}")
if not url:
logger.warning(f"No URL found in context for user {user.id} (callback query {query_id}). Button might be old or context lost.")
try:
# Edit the button message to inform the user
await query.edit_message_text(
text="Sorry, I couldn't find the original URL for this request (it might be too old or the bot restarted). Please send the link again.",
reply_markup=None # Remove buttons
)
except (BadRequest, TelegramError) as edit_e:
# Ignore errors like "message is not modified" or "message to edit not found"
if "not modified" not in str(edit_e).lower() and "not found" not in str(edit_e).lower():
logger.warning(f"Failed to edit 'URL not found' message {message_id_to_edit} for user {user.id}: {edit_e}")
except Exception as edit_e:
logger.warning(f"Error editing 'URL not found' message {message_id_to_edit} for user {user.id}: {edit_e}")
return # Stop processing if URL is missing
# Clear the URL from context once retrieved to prevent accidental reuse
context.user_data.pop('url_to_summarize', None)
context.user_data.pop('original_message_id', None) # Clear original message ID too
logger.debug(f"Cleared URL context for user {user.id}")
# --- Pre-task Checks ---
global TELEGRAM_TOKEN, _gemini_primary_enabled, _openrouter_fallback_enabled
if not TELEGRAM_TOKEN:
logger.critical("FATAL: TELEGRAM_TOKEN missing when trying to start background task!")
try: await query.edit_message_text(text="❌ Critical Bot Configuration Error (Missing Token). Cannot proceed.", reply_markup=None)
except Exception: pass
return
if not _gemini_primary_enabled and not _openrouter_fallback_enabled:
logger.critical("FATAL: Neither Gemini nor OpenRouter API keys are configured/valid when trying to start background task!")
try: await query.edit_message_text(text="❌ Critical AI Configuration Error: No summarization models available. Cannot proceed.", reply_markup=None)
except Exception: pass
return
elif not _gemini_primary_enabled:
logger.warning("Primary AI (Gemini) unavailable, relying solely on fallback for this task.")
elif not _openrouter_fallback_enabled:
logger.warning("Fallback AI (OpenRouter) unavailable, relying solely on primary for this task.")
# --- Schedule Background Task ---
logger.info(f"Scheduling background task for user {user.id}, chat {chat_id}, button message {message_id_to_edit}, url: {url[:60]}...")
asyncio.create_task(
process_summary_task(
user_id=user.id,
chat_id=chat_id,
message_id_to_edit=message_id_to_edit, # Pass the button message ID
url=url,
summary_type=summary_type,
bot_token=TELEGRAM_TOKEN
),
# Name the task for easier debugging if needed
name=f"SummaryTask-{user.id}-{message_id_to_edit}"
)
# Note: The process_summary_task will handle editing/deleting the message_id_to_edit
async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
"""Log Errors caused by Updates."""
logger.error("Exception while handling an update:", exc_info=context.error)
# Optionally add more context if 'update' is an Update object
if isinstance(update, Update) and update.effective_chat:
logger.error(f"Error occurred in chat {update.effective_chat.id}")
# --- Application Setup & Web Framework ---
async def setup_bot_config() -> Application:
logger.info("Configuring Telegram Application..."); global TELEGRAM_TOKEN
if not TELEGRAM_TOKEN: raise ValueError("TELEGRAM_TOKEN missing.")
# Configure HTTPX request settings for the main PTB application
custom_request = HTTPXRequest( connect_timeout=10.0, read_timeout=30.0, write_timeout=30.0, pool_timeout=60.0 )
application = Application.builder().token(TELEGRAM_TOKEN).request(custom_request).build()
# --- Add Handlers ---
application.add_handler(CommandHandler("start", start))
application.add_handler(CommandHandler("help", help_command))
# Use a filter that catches messages containing URL entities
url_filter = filters.Entity("url") | filters.Entity("text_link")
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND & url_filter, handle_potential_url))
# Handler for button clicks (summary type selection)
application.add_handler(CallbackQueryHandler(handle_summary_type_callback))
# Error handler
application.add_error_handler(error_handler)
logger.info("Telegram application handlers configured."); return application
@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
global ptb_app, WEBHOOK_SECRET, TELEGRAM_TOKEN
logger.info("ASGI Lifespan: Startup initiated...");
if not TELEGRAM_TOKEN: logger.critical("TG TOKEN missing."); raise RuntimeError("Telegram token missing.")
bot_setup_successful = False
webhook_set = False
try:
ptb_app = await setup_bot_config()
await ptb_app.initialize()
bot_info = await ptb_app.bot.get_me()
logger.info(f"Bot initialized: @{bot_info.username} (ID: {bot_info.id})")
bot_setup_successful = True # Mark bot setup as successful here
# --- Webhook Setup ---
# Check and delete existing webhook first
current_webhook_info = await ptb_app.bot.get_webhook_info()
if current_webhook_info and current_webhook_info.url:
logger.info(f"Found existing webhook: {current_webhook_info.url}. Attempting to delete...")
try:
if await ptb_app.bot.delete_webhook(drop_pending_updates=True):
logger.info("Existing webhook deleted successfully.")
else:
# API returned False, might not be critical but worth noting
logger.warning("Attempt to delete existing webhook returned False from API.")
except Exception as e:
logger.warning(f"Could not delete existing webhook (Error: {e}). Proceeding with setting new webhook.", exc_info=True)
await asyncio.sleep(1) # Short delay after potential delete
# Determine webhook URL (assuming deployment provides SPACE_HOST)
space_host = os.environ.get("SPACE_HOST")
if not space_host:
logger.critical("SPACE_HOST environment variable not found. Cannot set webhook.")
raise RuntimeError("SPACE_HOST environment variable missing.")
webhook_path = "/webhook" # Matches the route defined later
# Ensure correct protocol and clean host formatting
protocol = "https"
host = space_host.split('://')[-1].rstrip('/') # Remove trailing slashes
full_webhook_url = f"{protocol}://{host}{webhook_path}"
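# Illustrative example: SPACE_HOST="my-space.hf.space" -> "https://my-space.hf.space/webhook"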
logger.info(f"Calculated webhook URL: {full_webhook_url}")
# Set the new webhook
set_webhook_args = {
"url": full_webhook_url,
"allowed_updates": Update.ALL_TYPES, # Receive all update types
"drop_pending_updates": True # Ignore updates while bot was down
}
if WEBHOOK_SECRET:
set_webhook_args["secret_token"] = WEBHOOK_SECRET
logger.info("Webhook secret token will be used.")
else:
logger.info("No webhook secret token configured.")
# Give network/DNS a moment before setting
await asyncio.sleep(1.5)
logger.info(f"Attempting to set webhook to: {full_webhook_url} with args: {set_webhook_args}")
await ptb_app.bot.set_webhook(**set_webhook_args)
# Verify webhook setup
await asyncio.sleep(1) # Allow time for info propagation
new_webhook_info = await ptb_app.bot.get_webhook_info()
if new_webhook_info.url == full_webhook_url:
logger.info(f"Webhook successfully set: URL='{new_webhook_info.url}', Secret Token Set={bool(WEBHOOK_SECRET)}")
webhook_set = True
else:
logger.error(f"Webhook URL mismatch after setting! Expected '{full_webhook_url}', but GET response shows '{new_webhook_info.url}'. Check firewall/proxy/platform settings.")
# Decide whether to raise an error or try to continue
# For now, let's raise an error as webhook is critical
raise RuntimeError("Failed to verify webhook URL after setting.")
# Start the PTB application processing
await ptb_app.start()
logger.info("PTB Application started; processing updates received via webhook.")
logger.info("ASGI Lifespan: Startup complete."); yield # Application runs here
except Exception as startup_err:
logger.critical(f"Application startup failed: {startup_err}", exc_info=True)
# Attempt cleanup even if startup failed partially
if ptb_app and bot_setup_successful:
if ptb_app.running:
try: await ptb_app.stop()
except Exception as stop_err: logger.error(f"Error stopping PTB app during failed startup: {stop_err}")
# Try to delete webhook if it was potentially set
if webhook_set:
try:
logger.info("Attempting to delete webhook due to startup failure...")
await ptb_app.bot.delete_webhook(drop_pending_updates=True)
logger.info("Webhook deleted during failed startup cleanup.")
except Exception as del_wh_err: logger.error(f"Failed to delete webhook during failed startup cleanup: {del_wh_err}")
try: await ptb_app.shutdown()
except Exception as shutdown_err: logger.error(f"Error shutting down PTB app during failed startup: {shutdown_err}")
raise # Re-raise the original startup error
finally:
# --- Shutdown Logic ---
logger.info("ASGI Lifespan: Shutdown initiated...")
if ptb_app and bot_setup_successful:
# Stop PTB app first
if ptb_app.running:
logger.info("Stopping PTB Application processing...")
try: await ptb_app.stop()
except Exception as e: logger.error(f"Error stopping PTB application: {e}")
else: logger.info("PTB Application was not running.")
# Delete webhook before shutting down fully
try:
logger.info("Attempting to delete webhook on shutdown...")
if ptb_app.bot and hasattr(ptb_app.bot, 'delete_webhook'):
# Check if webhook is actually set before trying to delete
current_wh_info = await ptb_app.bot.get_webhook_info()
if current_wh_info and current_wh_info.url:
if await ptb_app.bot.delete_webhook(drop_pending_updates=True):
logger.info("Webhook deleted successfully on shutdown.")
else:
logger.warning("Failed to delete webhook on shutdown (API returned False).")
else:
logger.info("No webhook was set, skipping deletion.")
else:
logger.warning("Cannot delete webhook: Bot object unavailable or doesn't support delete_webhook.")
except Exception as e:
logger.warning(f"Could not delete webhook during shutdown: {e}", exc_info=False)
# Shutdown PTB application resources
logger.info("Shutting down PTB Application resources...")
try: await ptb_app.shutdown()
except Exception as e: logger.error(f"Error during PTB application shutdown: {e}")
logger.info("PTB Application shut down.")
else:
logger.info("PTB app not fully initialized or setup failed. Skipping PTB shutdown steps.")
logger.info("ASGI Lifespan: Shutdown complete.")
async def health_check(request: Request) -> PlainTextResponse:
"""Simple health check endpoint."""
global OPENROUTER_MODEL, GEMINI_MODEL, APIFY_ACTOR_ID, _apify_token_exists, _gemini_primary_enabled, _openrouter_fallback_enabled
global _urltotext_key_exists, _rapidapi_key_exists, SUPADATA_API_KEY, _crawl4ai_primary_scrape_enabled
bot_status = "Not Initialized"; bot_username = "N/A"
if ptb_app and ptb_app.bot:
try:
# Check if the application is running (processing updates)
app_running = ptb_app.running
# Try to get bot info regardless of running state if bot object exists
bot_info = await ptb_app.bot.get_me()
bot_username = f"@{bot_info.username}" if bot_info and bot_info.username else "Info Fetch Error"
if app_running:
bot_status = "Running"
else:
# If initialized but not running (e.g., during startup/shutdown)
bot_status = "Initialized (Not Processing Updates)"
except (TimedOut, NetworkError) as net_err:
bot_status = f"Network Error checking status: {type(net_err).__name__}"
bot_username = "N/A (Network Error)"
logger.warning(f"Health check: Network error getting bot info: {net_err}")
except Exception as e:
bot_status = f"Error checking status: {type(e).__name__}"
bot_username = "N/A (Error)"
logger.warning(f"Health check: Error getting bot info: {e}", exc_info=False)
elif ptb_app:
bot_status = "Initialized (Bot object missing?)"
bot_username = "N/A"
else:
bot_status = "Not Initialized"
bot_username = "N/A"
# Construct the response string
response_lines = [
f"TG Bot Summariser - Status: {bot_status} ({bot_username})",
"--- Summarization ---",
f"Primary Model (Gemini): {GEMINI_MODEL if _gemini_primary_enabled else 'DISABLED'}",
f"Fallback Model (OpenRouter): {OPENROUTER_MODEL if _openrouter_fallback_enabled else 'DISABLED'}",
"--- YouTube Transcripts ---",
"Primary (Lib): Enabled",
f"Fallback 1 (Supadata): {'Enabled' if SUPADATA_API_KEY else 'Disabled (Key Missing)'}",
f"Fallback 2 (Apify Actor): {APIFY_ACTOR_ID if _apify_token_exists else 'DISABLED (Token Missing)'}",
"--- Website Scraping ---",
f"Primary (Crawl4AI): {'Enabled' if _crawl4ai_primary_scrape_enabled else 'DISABLED (Library/Driver Missing?)'}",
"Fallback 1 (Direct+BS4): Enabled",
f"Fallback 2 (urltotext): {'Enabled' if _urltotext_key_exists else 'Disabled (Key Missing)'}",
f"Fallback 3/4 (RapidAPI): {'Enabled' if _rapidapi_key_exists else 'Disabled (Key Missing)'}",
f"Fallback 5/6 (Apify Actors): {'Enabled' if _apify_token_exists else 'Disabled (Token Missing)'}"
]
return PlainTextResponse("\n".join(response_lines))
async def telegram_webhook(request: Request) -> Response:
"""Handles incoming updates from Telegram."""
global ptb_app, WEBHOOK_SECRET # Ensure ptb_app is accessible
# --- Basic Checks ---
if not ptb_app:
logger.error("Webhook received but PTB application is not initialized.")
return PlainTextResponse('Bot application not initialized', status_code=503) # Service Unavailable
if not ptb_app.bot:
logger.error("Webhook received but PTB bot object is not available.")
return PlainTextResponse('Bot object not available', status_code=503)
if not ptb_app.running:
logger.warning("Webhook received but PTB application is not running (likely startup/shutdown).")
# Return 200 OK to Telegram to prevent retries, but log the warning.
return PlainTextResponse('Bot not actively processing', status_code=200)
# --- Security Check (Secret Token) ---
if WEBHOOK_SECRET:
token_header = request.headers.get("X-Telegram-Bot-Api-Secret-Token")
if not token_header:
logger.warning("Webhook received request MISSING secret token header, but one is configured.")
return Response(content="Forbidden: Missing secret token", status_code=403)
if token_header != WEBHOOK_SECRET:
logger.warning(f"Webhook received INVALID secret token. Header: '{token_header[:5]}...'")
return Response(content="Forbidden: Invalid secret token", status_code=403)
# If token matches, proceed
# --- Process Update ---
try:
update_data = await request.json()
update = Update.de_json(data=update_data, bot=ptb_app.bot)
logger.debug(f"Processing update_id: {update.update_id} via webhook")
# Use PTB's built-in update processing queue
await ptb_app.process_update(update)
# Return 200 OK to Telegram quickly after queuing the update
return Response(status_code=200)
except json.JSONDecodeError:
logger.error("Webhook received invalid JSON data.")
return PlainTextResponse('Bad Request: Invalid JSON', status_code=400)
except Exception as e:
# Log the error, but return 200 OK to prevent Telegram from resending the faulty update
logger.error(f"Error processing webhook update: {e}", exc_info=True)
return Response(status_code=200)
# --- Starlette App Definition ---
app = Starlette(
debug=False, # Set to False for production
lifespan=lifespan,
routes=[
Route("/", endpoint=health_check, methods=["GET"]),
Route("/webhook", endpoint=telegram_webhook, methods=["POST"]),
]
)
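# In production this ASGI app is expected to be served by an external ASGI server, e.g.
# (illustrative; adjust the module name to this file's actual name):
#   uvicorn app:app --host 0.0.0.0 --port 8080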
logger.info("Starlette ASGI application created with health check ('/') and Telegram webhook ('/webhook') routes.")
# --- Development Server & Playwright Check ---
if __name__ == '__main__':
import uvicorn
logger.warning("Running in development mode using Uvicorn directly - NOT recommended for production!")
# Check for Playwright installation on startup in dev mode
playwright_installed = False
try:
from playwright.async_api import async_playwright
playwright_installed = True
logger.info("Playwright library found.")
# Optional: Add playwright install command here if needed for dev
# Consider running `playwright install --with-deps` manually in your dev env
except ImportError:
logger.critical("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
logger.critical("Playwright library not found. Crawl4AI (Primary Scraper) WILL FAIL.")
logger.critical("Install it: pip install playwright")
logger.critical("Then install browsers: playwright install --with-deps")
logger.critical("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
# Check Crawl4AI explicitly
if not _crawl4ai_available:
logger.critical("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
logger.critical("Crawl4AI library not found. Primary Scraper WILL BE DISABLED.")
logger.critical("Install it: pip install crawl4ai")
logger.critical("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
elif not playwright_installed:
logger.warning("Crawl4AI is installed, but Playwright is missing. Crawl4AI will likely fail without Playwright drivers.")
# Get log level and port from environment or use defaults
log_level = os.environ.get("LOGGING_LEVEL", "info").lower()
local_port = int(os.environ.get('PORT', 8080)) # Use PORT env var, default 8080
# Run Uvicorn
uvicorn.run(
"__main__:app",
host='0.0.0.0', # Listen on all interfaces
port=local_port,
log_level=log_level,
reload=True # Enable auto-reload for development
) |