diff --git "a/data/indices/basic/docstore.json" "b/data/indices/basic/docstore.json" new file mode 100644--- /dev/null +++ "b/data/indices/basic/docstore.json" @@ -0,0 +1 @@ +{"docstore/metadata": {"5cdab5af-82e9-4b3d-9db0-1792e6438fd4": {"doc_hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305"}, "a31b64b9-d4c7-4ec6-8ca0-10d085d48205": {"doc_hash": "3b56832993ff14cd8b6c280e07e184e01c998a98408ebe7daf71b0ed2b3aee54", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "b17212a2-233f-492c-b53b-6a03c84d9f4f": {"doc_hash": "5c6db34104cad9b5efbc1466c24846e26f456ac8d92bb6c73d678463f81f48ae", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "21f04a43-0875-4b86-94e4-93b11843969f": {"doc_hash": "54ae6da0c8b69bc7ce5f40b044c2bbd059bbc08d3b6a57a06583d33f3c5fe5f8", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "35cb1ff3-4a4f-4616-a7c1-af26f0b9d1e9": {"doc_hash": "45bd85e9e5db01833c8a319827d4043e745c4d045e2824994cca618572012a73", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "db73e86c-4c24-4992-b2c4-96c3cb23e27e": {"doc_hash": "5284c9255f600848cb3fcb410eb7245db0db438d6a0237780a13304409519fda", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "66023780-9b49-46a9-9204-8a4786f537e9": {"doc_hash": "7a96efa56a5b9a4758de999a0a622f41e6bedab39d3d74d51d330870573b51ea", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "3ed814ba-a0a4-4328-802d-91780bc5964e": {"doc_hash": "98d5fe81d5949a66b402ddc5726a67433de0bd58cc6d0306e573784fce5a2146", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "2ce891bb-b95c-457c-859e-7a8980eb98c2": {"doc_hash": "c035c0c23bb571381c50d07b55536da60311769e924ce720d987eb4cdd4757c0", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "04a0e9e2-4399-4f8e-9bb7-4a92ca7f3dfa": {"doc_hash": "715286a1c9b9632ab53c0ff851e79b3cd169d3acbeb9126b8478315310dabbda", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "39a9562c-9040-4276-b6a3-0686a44bcf50": {"doc_hash": "268120bcdfd894cbb86409e273db453391b849f2da61f4de300271e6ff8a9d38", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "fe41da69-d1ee-4ce6-8888-34e9c9a759b5": {"doc_hash": "60fecee0ddb5e5251efd1b46acbc20d4ad642d9874f309b3dce95e3e88ada004", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "5e20f7b6-ac06-4a68-913b-419a2b585f5d": {"doc_hash": "868a157fa8067659291506e48c4406d0828679e3737a948fe8f8ed1374646230", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "c0f8c088-1ff8-4839-a325-2522f014510a": {"doc_hash": "648ee93f9c6cacd2f660539d45a3aff80bf95f0905619546b90aae7a155866fa", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "e29d9346-c846-4203-8c35-b8bb1e8fa481": {"doc_hash": "327e666629b2b2d724bf1be4145cc85a20fb03ba7b90e1efad4ef19d14b50da2", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "004767a1-85b2-466e-b117-b264a4667205": {"doc_hash": "2b3012f90b8a4edc474732a1f36a739f01eafebfbfb08200c5031e86bb15b44e", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "bd3c9a8c-f95b-478e-a4a2-01ed7f774d64": {"doc_hash": "85e16b8a0e8695f02bde804b18582770dafff05984046e6af78a6d3018faffc9", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "5084cc68-c3d7-4136-83c1-b8c48955719a": {"doc_hash": "d9840d158c2be51dc769a2c15c70a2a36e78b43ca9f8a486a89745767e4b879e", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "a1381f9d-d625-43ab-8869-28f2ec055ddd": {"doc_hash": "ee414daa3c426bcf6f3196581f608e236016520542e82688ceebf063fd45feef", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "7e4d7d91-e79e-49e5-8c71-8a48fc38cb46": {"doc_hash": 
"edee2168126ada7c3c4764b1e53036ca689cd0ddc87439c86eee915056ffd0d7", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}, "379dd8d5-bf98-4615-93a9-cba410df45ec": {"doc_hash": "3f1b77f5daba08528b49c812712da76e3346a3163aadfc64c9dfc3d3a212a9f6", "ref_doc_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4"}}, "docstore/data": {"a31b64b9-d4c7-4ec6-8ca0-10d085d48205": {"__data__": {"id_": "a31b64b9-d4c7-4ec6-8ca0-10d085d48205", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "b17212a2-233f-492c-b53b-6a03c84d9f4f", "node_type": "1", "metadata": {}, "hash": "5c6db34104cad9b5efbc1466c24846e26f456ac8d92bb6c73d678463f81f48ae", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "\u00c9COLE POLYTECHNIQUE F\u00c9D\u00c9RALE DE LAUSANNE\nENVIRONMENTAL COMPUTATIONAL SCIENCE AND EARTH OBSERVATION LABORATORY\nAutomated monitoring of insects\nMASTER THESIS\nL\u00e9onard PASI\nQUEBEC ARTIFICIAL INTELLIGENCE INSTITUTE\nSupervisors:\nProf. Devis Tuia (EPFL)\nProf. David Rolnick (MILA)\nSeptember 1, 2023\n\nContents\n1 Introduction 2\n2 Object Detection 6\n2.1 The framework . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 6\n2.1.1 Code improvements and experiment tracking . . . . . . . . . . . . . . . . 6\n2.1.2 Model evaluation, metrics and threshold analysis . . . . . . . . . . . . . . 7\n2.2 The training data . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 9\n2.2.1 The need for new training datasets . . . . . . . . . . . . . . . . . . . . . . . 9\n2.2.2 Garbage in, garbage out . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 10\n2.2.3 Training data synthesis . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11\n2.3 Model architecture and training recipe . . . . . . . . . . . . . . . . . . . . . . . . . 13\n2.4 Results . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 15\n3 Active Learning 19\n3.1 Measures of uncertainty . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 21\n3.2 Ensemble configurations . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 22\n3.3 Methods . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 23\n3.4 Results . . . . . . . . . . . . . . . . . . . . . . . . 
.", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1556, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "b17212a2-233f-492c-b53b-6a03c84d9f4f": {"__data__": {"id_": "b17212a2-233f-492c-b53b-6a03c84d9f4f", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "a31b64b9-d4c7-4ec6-8ca0-10d085d48205", "node_type": "1", "metadata": {}, "hash": "3b56832993ff14cd8b6c280e07e184e01c998a98408ebe7daf71b0ed2b3aee54", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "21f04a43-0875-4b86-94e4-93b11843969f", "node_type": "1", "metadata": {}, "hash": "54ae6da0c8b69bc7ce5f40b044c2bbd059bbc08d3b6a57a06583d33f3c5fe5f8", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": ". . . . . . . . . . . . . . . . . . . . . . . . . . 22\n3.3 Methods . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 23\n3.4 Results . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 24\n3.4.1 Scores distributions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 24\n3.4.2 Correlation between scores . . . . . . . . . . . . . . . . . . . . . . . . . . . 25\n3.4.3 Visualization of ranked images . . . . . . . . . . . . . . . . . . . . . . . . . 27\n4 Conclusion 30\nBibliography 31\n1\n\nChapter 1\nIntroduction\nT\nhe history of Homo Sapiensas an ecological killer begins earlier than many people imag-\nine. Around 50,000 years ago, Homo Sapiens first set foot in Australia; within a few\nthousand years, of the twenty-four genera of Australian land animals weighing fifty kilo-\ngrams or more, twenty-three became extinct [1]. These includedThylacoleo carnifex, a marsupial\nlion that was the continent\u2019s largest predator; Genyornis newtoni, a flightless bird over two me-\nters in height; and Procoptodon goliah, a giant kangaroo weighing over 200 kg. The colonization\nof the American continent led to a similar ecological catastrophe. Homo Sapiensstarted migrat-\ning south from Alaska about 14,000 years ago. In less than two millennia, humans had reached\nthe continent\u2019s southern tip. Concurrently, 72% and 83% of megafaunal genera become extinct\nin North and South America, respectively [2]. While the relative importance between (natural)\nclimate change and humans is still debated, the latest research suggests that the latter were the\nprincipal or necessary driver of the major extinction events in Australia and South America\n[3, 4].\nToday, the biosphere is undergoing its 6th mass extinction [5, 6]. A mass extinction occurs\nwhen the Earth loses more than three-quarters of its species in a geologically short interval [7];\nthe last one marked the end of the dinosaurs\u2019 era, 66 million years ago, and was caused by a\n10km-wide asteroid colliding with planet Earth. In contrast with all the previous, today\u2019s mass\nextinction is mostly driven by one species, Homo Sapiens. In fact, humans are responsible for\nall the major immediate causes of biotic destruction: habitat conversion, climate disruption,\noverexploitation, toxification, species invasions and disease [5]. 
The extinctions span numer-\nous species of plants [8] and animals, including mammals, birds, reptiles, amphibians, fish, and\ninsects [9].\nInsects are the most diverse group of animals. They include more than a million described\nspecies and represent more than half of all known living organisms [10]; moreover, it is esti-\nmated that 80% of insect species are still to be discovered [11]. Recent studies have demon-\nstrated alarming rates of insect diversity and abundance loss [12, 13, 14]. However, the data\non changes in insect species diversity and abundance has substantial taxonomic, spatial, and\ntemporal biases and gaps [15].", "mimetype": "text/plain", "start_char_idx": 1339, "end_char_idx": 4324, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "21f04a43-0875-4b86-94e4-93b11843969f": {"__data__": {"id_": "21f04a43-0875-4b86-94e4-93b11843969f", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "b17212a2-233f-492c-b53b-6a03c84d9f4f", "node_type": "1", "metadata": {}, "hash": "5c6db34104cad9b5efbc1466c24846e26f456ac8d92bb6c73d678463f81f48ae", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "35cb1ff3-4a4f-4616-a7c1-af26f0b9d1e9", "node_type": "1", "metadata": {}, "hash": "45bd85e9e5db01833c8a319827d4043e745c4d045e2824994cca618572012a73", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "In fact, humans are responsible for\nall the major immediate causes of biotic destruction: habitat conversion, climate disruption,\noverexploitation, toxification, species invasions and disease [5]. The extinctions span numer-\nous species of plants [8] and animals, including mammals, birds, reptiles, amphibians, fish, and\ninsects [9].\nInsects are the most diverse group of animals. They include more than a million described\nspecies and represent more than half of all known living organisms [10]; moreover, it is esti-\nmated that 80% of insect species are still to be discovered [11]. Recent studies have demon-\nstrated alarming rates of insect diversity and abundance loss [12, 13, 14]. However, the data\non changes in insect species diversity and abundance has substantial taxonomic, spatial, and\ntemporal biases and gaps [15]. A major reason for these short-falls is the inherent difficulty\nof identifying insects: expert knowledge is necessary to classify all insects collected in a trap,\nwhich makes this approach time- or cost-prohibitive to scale, especially as insect identification\nexpertise is in decline [16]. Using indicator species can be an effective approach to sidestep this\nproblem, but doing so often results in inadequate knowledge and compromised measures of\ninterest [17].\n2\n\nIn the last decade, another approach has emerged in species monitoring studies in order to\ndrastically reduce the dependence on manual labor by automatically processing large amounts\nof collected data: the use of deep learning. As part of this trend, in 2021, a team of researchers\nin Denmark published their work on a novel, automatic light trap to monitor moths using\ncomputer vision-based tracking and deep learning [18]. 
Moths make up the vast majority of\nthe order Lepidoptera, which by itself accounts for around 10% of all described species of living\norganisms. Moths are important as pollinators, herbivores and prey; as such, changes in the\nabundance of moths could have cascading effects through the food web. Additionally, some of\nthe most damaging pest species in agriculture and forestry are also moths [19, 20], suggesting\nthey are a very relevant group of insects to monitor more effectively.\nAs depicted in Fig.1.1, the system presented in [18] consists of a UV light to attract live moths\nduring night hours, a backlit white screen for the moths to rest on, a high-resolution web cam-\nera with a light ring, a computer and a powered junction box with DC-DC converter. A se-\nquence of images is captured and stored on a hard drive whenever a change within the cam-\nera field of view is detected by the computer vision system. On warm summer nights with\na high level of insect activity, more than 20,000 images are captured per night. The images\nare processed off-line through a pipeline that involves four steps. First, object detection is\nperformed with classic computer vision techniques: Otsu\u2019s method to separate foreground\nfrom background, morphological operations to filter out small noisy blobs and close blobs,\nand connected-component labeling. Second, tracking is used to ensure that each insect is only\ncounted once during its stay in the camera field of view. Third, each insect track is classified\nthrough a CNN into ten different classes, representing frequently observed species, groups of\nvery similar species, or false object detections without insects. Fourth, a summary of the indi-\nviduals detected and tracked by the algorithm is derived.\nFigure 1.1: The portable light trap with a light table, a white sheet, and UV light to attract live\nmoths during night hours. The computer vision system consists of a light ring, a camera with\na computer and electronics, and a powered junction box with DC-DC converter. From [18].\nNaturally, moths that fly in and out of the camera field of view will be counted multiple times;\nhowever, it is noted that moths tend to be rather stationary once they land on the sheet. Ad-\n3\n\nditionally, supposing that the average number of visits per individual is fixed over time, this\ndoesn\u2019t prevent from observing trends in species abundance. Hence, the system should be\nconsidered as a viable alternative to traditional methods that typically require tedious manual\nlabor (i.e., visiting the trap several times in a season for observation) and often result in the\nkilling of rare species of insects. Nonetheless, there is vast room for improvement. 
Around\nthe same time the research in [18] was published, prof.", "mimetype": "text/plain", "start_char_idx": 3494, "end_char_idx": 7967, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "35cb1ff3-4a4f-4616-a7c1-af26f0b9d1e9": {"__data__": {"id_": "35cb1ff3-4a4f-4616-a7c1-af26f0b9d1e9", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "21f04a43-0875-4b86-94e4-93b11843969f", "node_type": "1", "metadata": {}, "hash": "54ae6da0c8b69bc7ce5f40b044c2bbd059bbc08d3b6a57a06583d33f3c5fe5f8", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "db73e86c-4c24-4992-b2c4-96c3cb23e27e", "node_type": "1", "metadata": {}, "hash": "5284c9255f600848cb3fcb410eb7245db0db438d6a0237780a13304409519fda", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "The computer vision system consists of a light ring, a camera with\na computer and electronics, and a powered junction box with DC-DC converter. From [18].\nNaturally, moths that fly in and out of the camera field of view will be counted multiple times;\nhowever, it is noted that moths tend to be rather stationary once they land on the sheet. Ad-\n3\n\nditionally, supposing that the average number of visits per individual is fixed over time, this\ndoesn\u2019t prevent from observing trends in species abundance. Hence, the system should be\nconsidered as a viable alternative to traditional methods that typically require tedious manual\nlabor (i.e., visiting the trap several times in a season for observation) and often result in the\nkilling of rare species of insects. Nonetheless, there is vast room for improvement. Around\nthe same time the research in [18] was published, prof. David Rolnick got involved with the\nproject, and created a team at Mila\u2014of which I was part for this thesis\u2014to work on this chal-\nlenge. In the following paragraphs, I give a brief overview of the main limitations of the system\nas presented in [18], as well as the corresponding solution developed by the team at Mila.\nOne major difficulty in the project was to obtain enough training data for the classifier, espe-\ncially as some of the target species had few occurrences. As the team in [18] resolved to have a\nbalanced dataset, only 250 images per class were selected. Using extensive data augmentation,\nthe dataset was scaled up by a factor of 32, and a ResNet-50 [21] model with pretrained weights\nachieved high accuracy on the validation set 1. However, the performance of the model after\ndeployment dropped, for multiple reasons.\nOne reason was the presence of insect species outside the training dataset. A training dataset\nincluding all possible species in the region would be necessary, but given the huge diversity\nof insects, it would be impossible for a small team of researchers to build such a dataset from\nscratch. The solution proposed by the team at Mila is to use images from iNaturalist and Obser-\nvation.org, accessible through the Global Biodiversity Information Facility (GBIF). 
Given a list\nof moth species (typically multiple thousands) relevant in a certain region, datasets of hundred\nthousand labelled images can be created and used to train the model. Furthermore, the clas-\nsification task was decomposed in two sub-tasks: first, moth/non-moth binary classification;\nsecond, fine-grained species classification for moths. Both classifiers are trained on GBIF data.\nOther significant sources of errors in the system described in [18] were the object detection and\nthe tracking algorithm. The team at Mila improved both these steps: first, the classical com-\nputer vision techniques were replaced by a deep learning model for object detection; second,\nthe tracking algorithm was enhanced by introducing the similarity in the classifier\u2019s feature\nspace to the similarity metric between detections in consecutive images. The new pipeline is\nsummarized in Fig.1.2.\nFigure 1.2: Current pipeline.\n1There was perhaps a methodological error in the model evaluation. To capture the model performance on new\ndata, the split between training and validation datasets should have been done before augmentation.\n4\n\nFrom these exciting innovations arise new exciting challenges. With an inference time of more\nthan 3 seconds on CPU, the object detector is too slow, and its accuracy can still largely be\nimproved; the work to address these shortcomings will be presented in chapter 2. The new\nclassifier is affected by the domain shift between the GBIF training data and the target data\nfrom moth traps. While strong data augmentation operations have mitigated the problem,\neven better results are expected if the GBIF training datasets can be enhanced with manually\nlabeled images from the traps. The implementation of active learning techniques to make the\nmost of the manual work will be discussed in chapter 3.\n5\n\nChapter 2\nObject Detection\nWhen I joined the team in May, the speed of the object detector was a point of concern. For\nthe collaborators and users of the project without GPUs, the model\u2019s inference time of more\nthan three seconds on CPU was a big limitation. Additionally, there was an idea to transfer\nthe object detection step of the pipeline to the deployments, in order to reduce the amount of\nstored data. 
To keep the costs down, the deployments would also not be equipped with GPUs.\nFor these two reasons, increasing the object detector speed was very attractive.\nThe object detector was a Faster R-CNN with a ResNet-50-FPN backbone [22, 23], as imple-\nmented in PyTorch\u2019s torchvision package.", "mimetype": "text/plain", "start_char_idx": 7093, "end_char_idx": 11799, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "db73e86c-4c24-4992-b2c4-96c3cb23e27e": {"__data__": {"id_": "db73e86c-4c24-4992-b2c4-96c3cb23e27e", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "35cb1ff3-4a4f-4616-a7c1-af26f0b9d1e9", "node_type": "1", "metadata": {}, "hash": "45bd85e9e5db01833c8a319827d4043e745c4d045e2824994cca618572012a73", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "66023780-9b49-46a9-9204-8a4786f537e9", "node_type": "1", "metadata": {}, "hash": "7a96efa56a5b9a4758de999a0a622f41e6bedab39d3d74d51d330870573b51ea", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "The implementation of active learning techniques to make the\nmost of the manual work will be discussed in chapter 3.\n5\n\nChapter 2\nObject Detection\nWhen I joined the team in May, the speed of the object detector was a point of concern. For\nthe collaborators and users of the project without GPUs, the model\u2019s inference time of more\nthan three seconds on CPU was a big limitation. Additionally, there was an idea to transfer\nthe object detection step of the pipeline to the deployments, in order to reduce the amount of\nstored data. To keep the costs down, the deployments would also not be equipped with GPUs.\nFor these two reasons, increasing the object detector speed was very attractive.\nThe object detector was a Faster R-CNN with a ResNet-50-FPN backbone [22, 23], as imple-\nmented in PyTorch\u2019s torchvision package. Its goal is to localize insects. It was trained on a set\nof images on which bounding boxes could accurately be inferred with classical computer vi-\nsion techniques, due to the low density of moths and clean background. My efforts to improve\nthe model can be grouped around three main axes: the framework, the training data, and the\narchitecture.\n2.1. T HE FRAMEWORK\nHaving a good framework is fundamental to allow efficient development of a model. One\nneeds to be able to easily (i) keep track of experiments and link checkpoints to training runs,\n(ii) integrate new functionalities, and (iii) evaluate the models. As the work on the classification\npart of the pipeline had taken most of the team\u2019s time, none of this was in place for the object\ndetection.\n2.1.1. Code improvements and experiment tracking\nThe code was organized in a number of Jupyter Notebooks and some python modules. Jupyter\nNotebooks are fine for early data analysis and exploration, but they certainly have little place in\na more advanced project. 
The non-linear workflow and the lack of IDE features (such as linting\nand code styling warnings) favor errors and bad coding practices; additionally, they are terri-\nble for code versioning1, all reasons that make it hard for teammates to collaborate and expand\nthe project. Hence, a full refactor was needed. The result was a few modules for dataset dec-\nlarations and common functions definitions, and two command-line scripts, one for launching\ntrainings, and one for launching inferences. The supported models are all those available in\ntorchvision. The highest standards for code quality were held.\n1This was less of a concern early on, as the team was not using a common repository\n6\n\nDuring development, tens of trainings can be launched in short intervals of time, and the num-\nber of models quickly start to add up. It is important to be able to link each model to a training\nrun, with all its defining parameters. To this end, the popular Weights & Biases platform was\nadopted. The training configuration and the model weights are automatically uploaded to the\nplatform; the model checkpoint is also saved locally, under a name that includes the model\narchitecture and the training run ID assigned by W&B.\n2.1.2. Model evaluation, metrics and threshold analysis\nVisual inspection\nThe only way to evaluate the model was to run inferences and visualize the predicted bounding\nboxes. These were saved directly on copies of the images, which presents two major inconve-\nniences: (i) it is very inefficient in terms of memory, and (ii) it makes it very unpractical to\nvisualize the model performance as a function of the threshold. To address these issues, a sim-\nple python GUI application was created, using the standard tkinter package. A screenshot of\nthe GUI is displayed in Fig.2.1. It is started from command line by running the python script,\nwith two inputs: the path to the json file containing the model\u2019s predictions (which is outputted\nby the inference script), and, optionally, if they are in a different folder, the path to the images.\nThe window presents: (i) a slider to modify the threshold (bounding boxes will appear and\ndisappear accordingly); (ii) the path to the json file (a useful reminder when the app is run in\nparallel to compare models); (iii) the image filename; (iv) a counter indicating the current im-\nage number out of the total; (v) and an entry to modify the step (useful to quickly move across\nhundreds of images). The user moves from one image to the other with the left and right arrow\nkeys. 
The GUI also accepts ground truths instead of model predictions, in which case the score\nthreshold won\u2019t be displayed.", "mimetype": "text/plain", "start_char_idx": 10980, "end_char_idx": 15474, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "66023780-9b49-46a9-9204-8a4786f537e9": {"__data__": {"id_": "66023780-9b49-46a9-9204-8a4786f537e9", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "db73e86c-4c24-4992-b2c4-96c3cb23e27e", "node_type": "1", "metadata": {}, "hash": "5284c9255f600848cb3fcb410eb7245db0db438d6a0237780a13304409519fda", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "3ed814ba-a0a4-4328-802d-91780bc5964e", "node_type": "1", "metadata": {}, "hash": "98d5fe81d5949a66b402ddc5726a67433de0bd58cc6d0306e573784fce5a2146", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "It is started from command line by running the python script,\nwith two inputs: the path to the json file containing the model\u2019s predictions (which is outputted\nby the inference script), and, optionally, if they are in a different folder, the path to the images.\nThe window presents: (i) a slider to modify the threshold (bounding boxes will appear and\ndisappear accordingly); (ii) the path to the json file (a useful reminder when the app is run in\nparallel to compare models); (iii) the image filename; (iv) a counter indicating the current im-\nage number out of the total; (v) and an entry to modify the step (useful to quickly move across\nhundreds of images). The user moves from one image to the other with the left and right arrow\nkeys. The GUI also accepts ground truths instead of model predictions, in which case the score\nthreshold won\u2019t be displayed.\nFigure 2.1: Screenshot of the GUI app to inspect models\u2019 predictions\n7\n\nMean Average Precision\nThe GUI is a nice tool to inspect the model\u2019s predictions, however it is still a far cry from hav-\ning actual testing datasets and metrics. In fact, visual inspection is both time-consuming and\nlimited in its ability to measure small improvements, while also being affected by confirmation\nbias. With currently ongoing annotation efforts, the test sets will soon finally be available. In\nanticipation of that, I developed the relevant metrics. The mean average precision (mAP) is the\nstandard metric for object detectors. In short, it is obtained by computing the average preci-\nsion (AP) across of set of IoU (Intersection over Union) thresholds for each class and taking the\nmean of all APs. The AP is related to the area under the precision-recall (PR) curve, although\nit isn\u2019t exactly the same. In object detection, the PR curve depends on the IoU threshold, as it\ndetermines how strict the requirement is for a ground truth and a predicted bounding box to\nmatch (hence, it sets the boundary between true positives, false positives, etc.). For each IoU\nthreshold, one PR curve and one AP measure is obtained. The mAP has the big advantage\nof being independent of the IoU threshold, which is important to evaluate models fairly. 
Its\nimplementation in the torchmetrics library made it easy to integrate in the framework. During\ntrainings, the mAP is computed on the evaluation set at each epoch and uploaded to W&B.\nProvided that the evaluation set is fixed, this offers a quick way to compare training on the\nW&B - much better than using the model\u2019s loss, which can change across models and is not\ninterpretable.\nThreshold analysis\nWhile being a useful metric to compare models, the mAP doesn\u2019t help in setting the optimal\nthreshold of the chosen model, which is a critical task. Not to be confused with the IoU thresh-\nold mentioned above, the score threshold determines whether a predicted bounding box is kept\nin the final prediction or not. It is therefore critical for the model\u2019s performance. A common\nway to choose the score threshold is from the precision recall curve. To make this operation pos-\nsible, the computation of the PR curve was implemented from scratch, and a GUI was created.\nThe GUI is presented in Fig.2.2. It takes as inputs the IoU threshold(s) at which to compute\nthe PR curve, and the paths to the json files containing the ground truths and the model pre-\ndictions. The user can set the model threshold and visualize the corresponding point on the\nPR curve, hence know what precision (i.e. the proportion of correct insect detections among\nall detections) and recall (i.e. the proportion of correct insect detections among all insects on\nthe images) to expect from the model. Given a testing dataset for a specific deployment, this\ninformation would be very valuable for the ecologists using the system.\n8\n\nFigure 2.2: Precision-recall curve for threshold analysis\n2.2. T HE TRAINING DATA\n2.2.1. The need for new training datasets\nAs mentioned before, the Faster R-CNN model was trained on a set of images on which bound-\ning boxes could accurately be inferred with classical computer vision techniques, due to the\nlow density of moths and clean background. A visual inspection of this dataset revealed that\nit is actually not from any of the current deployments 2, and that a different data acquisition\ntechnique was used, leading to way less sharp edges. This is a clear example of data-shift. 
In\nthe past, quick tests with a SSDlite model \u2014the lightest and faster architecture available on\ntorchvision\u2014, had failed.", "mimetype": "text/plain", "start_char_idx": 14614, "end_char_idx": 19120, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "3ed814ba-a0a4-4328-802d-91780bc5964e": {"__data__": {"id_": "3ed814ba-a0a4-4328-802d-91780bc5964e", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "66023780-9b49-46a9-9204-8a4786f537e9", "node_type": "1", "metadata": {}, "hash": "7a96efa56a5b9a4758de999a0a622f41e6bedab39d3d74d51d330870573b51ea", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "2ce891bb-b95c-457c-859e-7a8980eb98c2", "node_type": "1", "metadata": {}, "hash": "c035c0c23bb571381c50d07b55536da60311769e924ce720d987eb4cdd4757c0", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Given a testing dataset for a specific deployment, this\ninformation would be very valuable for the ecologists using the system.\n8\n\nFigure 2.2: Precision-recall curve for threshold analysis\n2.2. T HE TRAINING DATA\n2.2.1. The need for new training datasets\nAs mentioned before, the Faster R-CNN model was trained on a set of images on which bound-\ning boxes could accurately be inferred with classical computer vision techniques, due to the\nlow density of moths and clean background. A visual inspection of this dataset revealed that\nit is actually not from any of the current deployments 2, and that a different data acquisition\ntechnique was used, leading to way less sharp edges. This is a clear example of data-shift. In\nthe past, quick tests with a SSDlite model \u2014the lightest and faster architecture available on\ntorchvision\u2014, had failed. I hypothesized that one of the reasons could have been data-shift:\nwhile the relatively heavy Faster R-CNN with ResNet-50-FPN backbone can handle the shift, a\nsmaller model is expected to have less generalization capacity, and hence be more sensible to it.\nTo test this hypothesis, a SSDlite model was trained. The validation set was created by selecting\nimages after a certain time stamp, as opposed to a random split. A number of images were\nused as a gap between training and validation sets, and thus were unused. This ensured a\ncertain diversity between validation and training images, as consecutive images can be very\nsimilar to one another. The model was tested on the validation set and on new images, and\nthe predictions were inspected with the apposite GUI app. As shown in Fig.2.3, a significant\ndrop in performance was observed between validation images and target images, confirming\nthe initial hypothesis, and suggesting that a new training dataset was needed to facilitate the\nadoption of a light-weight model.\n2Or at least, not from any of the deployments from which images available to the team at Mila are.\n9\n\n(a) Old image\n (b) Recent image\nFigure 2.3: Performance comparison of a SSDlite model trained on old images. Reasonable\npredictions are given across a wide spectrum of thresholds on the old images (a), while absurd\npredictions are made on the new images (b). 
In the example, the largest square bounding box\nhas the highest confidence.\n2.2.2. Garbage in, garbage out\nFollowing the findings described in the previous section, a dataset of two thousand images\nfrom four different deployments (Quebec, Vermont, Newfoundland and Panama) was assem-\nbled. Bounding boxes were derived automatically using the Faster R-CNN model. However,\nan extensive inspection of the predictions revealed that the model was prone to two types of\nerrors: missed detections (i.e. false negatives), especially on smaller moths, and double detec-\ntions (i.e. bounding boxes that group multiple moths together). Additionally, the model would\noccasionally also make multiple predictions on the same large moth. Finally, the bounding\nboxes were often too loose around the insects, which makes them not ideal to train on (object\ndetection annotation best practices include drawing perfectly tight bounding boxes). These\nerrors, some of which are displayed in Fig.2.4, reflect the deficiencies of the model\u2019s training\ndata, where small moths are often not annotated, moths are seldom close to one another and\nbounding boxes are often loose.\nFigure 2.4: Common mistakes from the old Faster R-CNN model.\n10\n\nThe goal was no longer to make the object detector faster while maintaining the same level\nof accuracy: the accuracy needed to improve as well. Any model trained on the dataset\nwith Faster R-CNN\u2019s bounding boxes would replicate the same mistakes (as the saying goes,\n\"Garbage in, garbage out\"). Hence, the efforts to create a new training dataset were not over.\n2.2.3. Training data synthesis\nOne bold idea to create new training data was suggested by a teammate: use Meta\u2019s recently\nreleased Segment Anything Model (SAM, [24]) to crop insects, and paste the crops on empty\nbackground images; in other worlds, to synthesize the training dataset. An immediate concern\nwas that a model trained on such a dataset would learn to detect pasted objects instead of actual\ninsects, i.e. its performance would degrade on natural images. However, given the potential of\nthe idea, the gamble was taken. The pipeline consists of three steps: (i) run the inferences with\nSAM, (ii) manually filter the crops, (iii) and synthesize new images.", "mimetype": "text/plain", "start_char_idx": 18278, "end_char_idx": 22751, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "2ce891bb-b95c-457c-859e-7a8980eb98c2": {"__data__": {"id_": "2ce891bb-b95c-457c-859e-7a8980eb98c2", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "3ed814ba-a0a4-4328-802d-91780bc5964e", "node_type": "1", "metadata": {}, "hash": "98d5fe81d5949a66b402ddc5726a67433de0bd58cc6d0306e573784fce5a2146", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "04a0e9e2-4399-4f8e-9bb7-4a92ca7f3dfa", "node_type": "1", "metadata": {}, "hash": "715286a1c9b9632ab53c0ff851e79b3cd169d3acbeb9126b8478315310dabbda", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Hence, the efforts to create a new training dataset were not over.\n2.2.3. 
Training data synthesis\nOne bold idea to create new training data was suggested by a teammate: use Meta\u2019s recently\nreleased Segment Anything Model (SAM, [24]) to crop insects, and paste the crops on empty\nbackground images; in other worlds, to synthesize the training dataset. An immediate concern\nwas that a model trained on such a dataset would learn to detect pasted objects instead of actual\ninsects, i.e. its performance would degrade on natural images. However, given the potential of\nthe idea, the gamble was taken. The pipeline consists of three steps: (i) run the inferences with\nSAM, (ii) manually filter the crops, (iii) and synthesize new images.\nInferences with SAM\nA python command-line script was developed to run the SAM model with custom parameters\non the desired images. The script can be used to generate bounding boxes, which are saved\nin a json file, and crops of detected objects. In the first case, SAM is used as an object detec-\ntor; unfortunately, while the predicted bounding boxes are perfectly tight, there are too many\nmistakes to consider SAM as a drop-in replacement of Faster R-CNN for the annotation of new\ndatasets. Common errors include missed detections of small moths, and wings that are con-\nsidered as separate objects; tweaking SAM\u2019s parameters to improve on one problem tended to\nmake the other worse. For the following tasks, it was important not to miss the small moths, in\norder not to bias the final dataset towards larger moths. Hence, the parameters were tweaked\naccordingly.\nA diverse collection of nearly 300 images was assembled from five different locations (Den-\nmark, Vermont, Quebec, Panama and Newfoundland). The IDs 3 of the selected images were\nstored for reference. The image processing produced almost 4k crops. Each crop of a detected\nobject consists of two arrays: one array is a crop of the image, the other is a boolean mask indi-\ncating the object inside the crop. Each array is saved in the npy format, NumPy\u2019s standard for\npersisting a single arbitrary NumPy array on disk. The .npy files are zipped in a .npz file, with\na naming convention that allows to pair crops and their corresponding mask together when\nreading the file later on.\nManual review of the segmented objects\nAs stated above, SAM\u2019s predictions are not perfect. Hence, there is still need for manual review.\nLuckily, with the right tools, manual review is very fast: critically, it ismuch faster than drawing\nbounding boxes. This is because the only required operation is to delete wrong detections. All\ndetections are perfectly tight around the object, so there is no need to adjust the bounding\nboxes. The tool developed for this task is a GUI application, presented in Fig.2.5. The user\ncan swipe through the crops read from the .npz file with the left and right arrow keys. Bad\ncrops can be deleted in a split second by pressing the delete key. When the first crop is deleted,\nthe \"Save as\" and \"Discard\" buttons are activated, such that changes can be saved (under the\ndesired filename) or discarded at any time. If the window is closed with unsaved changes, a\n3Regrettably, there is actually no guarantee of uniqueness of the image filename across the whole project.\n11\n\ndialog box appears to ask whether to save the changes or not. 
The user can also play with the\noverlay that appears on the segmented object, by making it more or less transparent.\nFigure 2.5: GUI app to review SAM\u2019s crops.\nImage synthesis\nThe command line script developed to synthesize the new images has four main inputs: the\n.npz file with the crops, the path to a folder with background images, the number of new im-\nages to create from each background, and the number of crops to paste on each image. For each\nnew image, moths are pasted one after the other at random locations. Collisions are taken care\nof: if a moth overlaps with one of the already pasted moths by more than a certain threshold 4,\na new random position is selected. Similarly, moths are not allowed to be out of the image by\nmore than a certain portion. Two types of simple augmentations are used: rotations (90\u25e6, 180\u25e6,\n270\u25e6) and flips (horizontal and vertical).", "mimetype": "text/plain", "start_char_idx": 22019, "end_char_idx": 26239, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "04a0e9e2-4399-4f8e-9bb7-4a92ca7f3dfa": {"__data__": {"id_": "04a0e9e2-4399-4f8e-9bb7-4a92ca7f3dfa", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "2ce891bb-b95c-457c-859e-7a8980eb98c2", "node_type": "1", "metadata": {}, "hash": "c035c0c23bb571381c50d07b55536da60311769e924ce720d987eb4cdd4757c0", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "39a9562c-9040-4276-b6a3-0686a44bcf50", "node_type": "1", "metadata": {}, "hash": "268120bcdfd894cbb86409e273db453391b849f2da61f4de300271e6ff8a9d38", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Figure 2.5: GUI app to review SAM\u2019s crops.\nImage synthesis\nThe command line script developed to synthesize the new images has four main inputs: the\n.npz file with the crops, the path to a folder with background images, the number of new im-\nages to create from each background, and the number of crops to paste on each image. For each\nnew image, moths are pasted one after the other at random locations. Collisions are taken care\nof: if a moth overlaps with one of the already pasted moths by more than a certain threshold 4,\na new random position is selected. Similarly, moths are not allowed to be out of the image by\nmore than a certain portion. Two types of simple augmentations are used: rotations (90\u25e6, 180\u25e6,\n270\u25e6) and flips (horizontal and vertical). To maximize the diversity of the augmented crops,\nthe transformations are applied cyclically: first, the moths are pasted without augmentations;\nwhen all the moths in the collection have been pasted once, the algorithm cycles again through\nthe collection, and a first set of augmentations is applied; at the second cycle, another set is\nchosen, and so on. A synthetic image is displayed in Fig2.6 as an example.\nWith the GUI described before, a collection of 2600 clean crops was created, during a single day\nof work. From these, using a collection of more than a hundred empty background images, a\ndataset of 5k images with was synthesized. For comparison, manual annotation of that many\nimages is expected to take at least a week. 
Most importantly, it appeared that the bet paid\noff: models trained on synthetic data performed well on natural data \u2013better than the previ-\nous model. The results will be presented more in details in the corresponding section. Lastly,\nit is worth noting one advantage of this technique: whenever a new deployment is created,\n4in terms of overlapping surface over moth area\n12\n\ncorresponding background images can be used with the existing collection of segmented in-\nsects to update the training dataset; this way, models can quickly be made familiar with new\ndeployments backgrounds, which might be beneficial for the models\u2019 accuracy.\nFigure 2.6: A synthetic image. In the last version of the synthetic dataset, moth density was\nincreased from twenty to thirty moths per image.\n2.3. M ODEL ARCHITECTURE AND TRAINING RECIPE\nSSDlite\nTo improve the speed of the model, changes in the architecture are necessary. In torchvision,\nmany object-detection models are available off-the-shelf. A very informative table gives an\noverview of the models, with their performance on the standard COCO benchmark dataset,\ntheir size, and a link to the training recipe. As stated before, the fastest available model \u2014by\nfar\u2014- is SSDlite, an adaptation of the Single Shot Detector [25] which was first briefly intro-\nduced on the MobileNetV2 paper [26] and later reused on the MobileNetV3 paper [27]. Initially,\nefforts were pursued to obtain a SSDlite model with good performance. To do so, following the\nindicated training recipe, some functionalities were added to the training script: the cosine an-\nnealing with warm-up epochs learning rate schedule (see Fig. 2.7), and the random crops and\nrandom horizontal flips data augmentations. While each of these delivered improvements, the\nmodel performance \u2013as measured on the validation sets and logged to W&B for each train-\ning run\u2013 remained unsatisfactory. More precisely, the models were too sensitive to the score\nthreshold. When the synthetic datasets were introduced, the performance on the (synthetic)\nvalidation sets dropped. In part, this was revealed to be due to the decrease in bounding box\nsize (caused by the tight fit), which is shown in Fig.2.8. Still, the idea to use SSDlite as imple-\nmented in torchvision was abandoned. In hindsight, this should have been done much before;\nSSDlite internally resizes the images to 320x320 pixels, so it is only normal that it performs\npoorly on such small objects.\n13\n\nFigure 2.7: Learning rate displayed for various training runs on the W&B dashboard. 
The\nschedule is cosine annealing, with varying numbers of warm-up epochs.\nFigure 2.8: Bounding box size distribution for the natural and synthetic datasets.\nFaster R-CNN and Retina-Net\nTo test the newly created synthetic dataset, the same architecture of the model to beat was\nemployed.", "mimetype": "text/plain", "start_char_idx": 25482, "end_char_idx": 29800, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "39a9562c-9040-4276-b6a3-0686a44bcf50": {"__data__": {"id_": "39a9562c-9040-4276-b6a3-0686a44bcf50", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "04a0e9e2-4399-4f8e-9bb7-4a92ca7f3dfa", "node_type": "1", "metadata": {}, "hash": "715286a1c9b9632ab53c0ff851e79b3cd169d3acbeb9126b8478315310dabbda", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "fe41da69-d1ee-4ce6-8888-34e9c9a759b5", "node_type": "1", "metadata": {}, "hash": "60fecee0ddb5e5251efd1b46acbc20d4ad642d9874f309b3dce95e3e88ada004", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "In part, this was revealed to be due to the decrease in bounding box\nsize (caused by the tight fit), which is shown in Fig.2.8. Still, the idea to use SSDlite as imple-\nmented in torchvision was abandoned. In hindsight, this should have been done much before;\nSSDlite internally resizes the images to 320x320 pixels, so it is only normal that it performs\npoorly on such small objects.\n13\n\nFigure 2.7: Learning rate displayed for various training runs on the W&B dashboard. The\nschedule is cosine annealing, with varying numbers of warm-up epochs.\nFigure 2.8: Bounding box size distribution for the natural and synthetic datasets.\nFaster R-CNN and Retina-Net\nTo test the newly created synthetic dataset, the same architecture of the model to beat was\nemployed. The new Faster R-CNN with ResNet-50-FPN did great, both in terms of mAP on\nthe (synthetic) evaluation set, and, upon visual inspection, on natural images. Mimicking the\ntraining recipe indicated in torchvision, pre-trained weights for the backbone and a multi-step\nlearning rate schedule were adopted, with no augmentations other than the horizontal flip. As\nthe distribution of bounding box sizes displayed in Fig.2.8 is relatively narrow, the number\nof anchors \u2014a parameter that is not readily accessible\u2014 was reduced. This was expected to\nimprove the model speed without affecting its accuracy, but in fact it affected neither of those.\nIn hindsight, perhaps the reason is that, with such high resolution images (4096x2160 pixels),\n14\n\nmost of the inference time is taken to compute the feature map. While the performance goals\nhad been achieved, the model was as slow as ever.\nExperimentation with RetinaNet-v2 [28], a much more modern architecture compared to Faster\nR-CNN, was disappointing as there were no improvements neither in speed nor in accuracy.\nThere was good reason to believe that the single-stage model would be faster, but it didn\u2019t\nprove so. Again, this could be due to the high resolution of the images, which make the\nbackbone the main bottleneck. 
The accuracy was measured on the synthetic evaluation set.\nHowever, the assumption that small differences in accuracy on synthetic images proportion-\nally translate to natural images is yet to be verified. Further efforts to improve the accuracy of\nthe model should be delayed until proper test sets are available.\nFinally, the Faster R-CNN with MobileNetV3-Large-FPN backbone was tested, with great suc-\ncess. On CPU, the model was 6 times fasterthan its equivalent with heavier backbone. Again,\nusing the performance on synthetic data as a proxy for the performance on natural data, hyper-\nparameters such as the number of trainable backbone layers and the learning rate were tweaked\nto optimal values. An attempt to match this model accuracy with a custom RetinaNet-v2 with\nMobileNetV3-Large-FPN backbone \u2014a pairing that is not available off-the-shelf\u2014 proved un-\nsuccessful.\n2.4. R ESULTS\nThe goals of this work were to improve the object detector both in terms of speed and accuracy.\nWhile also creating a good basis for further development, these goals were achieved with the\ncreation (and deployment) of two Faster R-CNN models: a slow model (with ResNet-50-FPN\nbackbone) and a fast model (with MobileNetV3-Large-FPN). In the following pages, a visual\ncomparison between the performance of the old and new models is presented. To ensure fair-\nness, a representative subsample of the differences viewed during image inspection is shown.\nA constant threshold was kept for each model. Each figure displays three images: the first\ncorresponds to the old model, the second to the new slow model, and the third to the new fast\nmodel. When only two images are presented, the second image counts for both new models\n(predictions were identical).\nFigure 2.9\n15\n\nFigure 2.10\nFigure 2.11\nFigure 2.12\nFigure 2.13\n16\n\nFigure 2.14\nFigure 2.15\nFigure 2.16\nFigure 2.17\n17\n\nFigure 2.18\nFrom visual inspection of the models\u2019 predictions on natural images, it seems safe to say that\nboth are better than the previous model in terms of accuracy, largely thanks to the new training\ndataset.", "mimetype": "text/plain", "start_char_idx": 29041, "end_char_idx": 33148, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "fe41da69-d1ee-4ce6-8888-34e9c9a759b5": {"__data__": {"id_": "fe41da69-d1ee-4ce6-8888-34e9c9a759b5", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "39a9562c-9040-4276-b6a3-0686a44bcf50", "node_type": "1", "metadata": {}, "hash": "268120bcdfd894cbb86409e273db453391b849f2da61f4de300271e6ff8a9d38", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "5e20f7b6-ac06-4a68-913b-419a2b585f5d", "node_type": "1", "metadata": {}, "hash": "868a157fa8067659291506e48c4406d0828679e3737a948fe8f8ed1374646230", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "To ensure fair-\nness, a representative subsample of the differences viewed during image inspection is shown.\nA constant threshold was kept for each model. Each figure displays three images: the first\ncorresponds to the old model, the second to the new slow model, and the third to the new fast\nmodel. 
When only two images are presented, the second image counts for both new models\n(predictions were identical).\nFigure 2.9\n15\n\nFigure 2.10\nFigure 2.11\nFigure 2.12\nFigure 2.13\n16\n\nFigure 2.14\nFigure 2.15\nFigure 2.16\nFigure 2.17\n17\n\nFigure 2.18\nFrom visual inspection of the models\u2019 predictions on natural images, it seems safe to say that\nboth are better than the previous model in terms of accuracy, largely thanks to the new training\ndataset. The following observations can be made about the new models:\n(i) they are better at detecting small moths (Fig.2.13, 2.14, 2.15, 2.16, 2.17, 2.18);\n(ii) they are better at separating overlapping moths, although not perfect (Fig.2.9, 2.12);\n(iii) they don\u2019t get fooled by artifacts on the screen (Fig.2.11, 2.15, 2.16, 2.18;\n(iv) they seem more resistant to noisy images (e.g. moths flying close to the camera, Fig.2.10,\n2.14);\n(v) by design, they tend not to detect mosquitos and flies, as those were present on the back-\nground images used for the synthetic dataset, and were filtered out of the segmented\ncrops (Fig.2.18);\n(vi) by design, they predict tight bounding boxes;\n(vii) the slow model seems to have an edge over the fast mode (Fig.2.9 and 2.11, which is why\nit was also released. Users can select the desired model based on their time constraints.\nThe code for the object detection module is available on the ami-ml public GitHub repository.\nHopefully, the code for the other modules will soon be merged on the repo as well.\n18\n\nChapter 3\nActive Learning\nThe final step of the pipeline presented in Fig.1.2 is the moth species classification. As men-\ntioned in the introduction, since there is no diverse and extensive labeled image dataset from\nmoth traps, and given that making one from scratch is not feasible with the current time and\nbudget constraints, the training dataset was constructed with GBIF. For a single region, there\nare usually multiple thousands relevant species of moths; GBIF has been fundamental to build\na classifier able to identify as many classes, making it possible to create training datasets of\nmany hundred thousand images. Unfortunately, there is a significant data shift between train-\ning images and images from moth traps.\nTypical images from GBIF and from moth traps (following object detection) are presented in\nFig.3.1 and Fig.3.2, respectively. In the trap images, the background is uniform, the camera an-\ngle is fixed, and the moths fill the frame. Additionally, these images are low resolution (around\n120 pixels per side, on average). In contrast, GBIF images are mostly high resolution. The\nbackground is heterogeneous, with moths sometimes blending into it; moths are pictured from\nvarying perspectives and often only occupy a small portion of the image. While strong data\naugmentation operations have been helpful in mitigating the data shift, there is still a drop in\nspecies classification accuracy of about 10% from GBIF test images to trap images (a very small\nset of such images was labeled by experts). One way to tackle the problem would be to in-\ncorporate trap images to the training set. Even a relatively small number of these images (e.g.\n5% of the training set) is expected to significantly reduce the accuracy drop, and with more\nand more entomologists becoming interested in the project, the necessary work force to label\nthe images could become available. 
In this context, active learning appears as a very attractive\nfield of research.\nActive learning (AL) encompasses all the techniques that aim to maximize the performance\nimprovement of a model upon labeling of new samples and consequent addition to the train-\ning dataset, by selecting the most valuable samples. Hence, it is about making the most of\nthe human annotators\u2019 work. The typical workflow is depicted in algorithm 1. In the past\ndecade, with the emergence of data-hungry deep learning, AL has attracted renewed interest,\nand plenty of techniques have been developed.", "mimetype": "text/plain", "start_char_idx": 32406, "end_char_idx": 36655, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "5e20f7b6-ac06-4a68-913b-419a2b585f5d": {"__data__": {"id_": "5e20f7b6-ac06-4a68-913b-419a2b585f5d", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "fe41da69-d1ee-4ce6-8888-34e9c9a759b5", "node_type": "1", "metadata": {}, "hash": "60fecee0ddb5e5251efd1b46acbc20d4ad642d9874f309b3dce95e3e88ada004", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "c0f8c088-1ff8-4839-a325-2522f014510a", "node_type": "1", "metadata": {}, "hash": "648ee93f9c6cacd2f660539d45a3aff80bf95f0905619546b90aae7a155866fa", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Even a relatively small number of these images (e.g.\n5% of the training set) is expected to significantly reduce the accuracy drop, and with more\nand more entomologists becoming interested in the project, the necessary work force to label\nthe images could become available. In this context, active learning appears as a very attractive\nfield of research.\nActive learning (AL) encompasses all the techniques that aim to maximize the performance\nimprovement of a model upon labeling of new samples and consequent addition to the train-\ning dataset, by selecting the most valuable samples. Hence, it is about making the most of\nthe human annotators\u2019 work. The typical workflow is depicted in algorithm 1. In the past\ndecade, with the emergence of data-hungry deep learning, AL has attracted renewed interest,\nand plenty of techniques have been developed. There are two main categories: uncertainty-\nbased techniques and diversity-based techniques. The previous try to find images the model\nisn\u2019t certain about, as a proxy of the model being wrong about the image. The latter try to find\nimages that best represent the diversity existing in the pool of unlabeled images (e.g. [29]).\nFinally, there are also techniques that combine both approaches. 
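The pool-based workflow described here, and formalised in Algorithm 1 below, can be summarised in a short sketch. This is a minimal illustration: the train, score and annotate callables are hypothetical placeholders, not functions from the project code, and L is treated as a plain list of (image, label) pairs.

def active_learning_loop(L, U, train, score, annotate, k, n_rounds):
    # L: labeled training set, U: unlabeled pool, k: samples queried per round.
    model = train(L)
    for _ in range(n_rounds):
        # Score the remaining pool and query the top-k most informative samples.
        ranked = sorted(U, key=lambda x: score(model, x), reverse=True)
        queried = ranked[:k]
        labels = annotate(queried)              # human annotators provide labels
        L = L + list(zip(queried, labels))      # add the newly labeled samples
        U = [x for x in U if x not in queried]  # remove them from the pool
        model = train(L)                        # retrain on the enlarged set
    return model, L, U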
For an initial exploration of\nthe topic under strict time constraints, precedence was given to the more easily implementable\nand scalable uncertainty-based techniques.\n19\n\n(a) Eudeilinia herminiata\n (b) Acronicta insita\n (c) Thysania zenobia\n(d) Syngrapha rectangula\n (e) Cosmia calami\nFigure 3.1: Images from GBIF\nFigure 3.2: Images from moth traps, following object detection\n20\n\nAlgorithm 1: The pool-based active learning workflow, which is most common in deep learning.\nFrom [30]\nInput: Initial training dataset L, unlabeled data pool U, annotators A\nOutput: A well-trained model M with least labeling cost\n1: while End condition isn\u2019t met do\n2: Train the model M with L\n3: Obtain the representation R of all samples x \u2208 U, R = M(x)\n4: Query the top-K informative samples K via selection strategies, based on R\n5: Annotate the samples K and obtain the labels YK = A(K)\n6: Update L = L \u222a {K, YK}, update U = U/K\n7: end while\n3.1. MEASURES OF UNCERTAINTY\nNeural network predictions are notoriously unreliable when the input sample is out of the\ntraining distribution or corrupted by noise. To effectively apply active learning techniques in\ndeep learning frameworks, better estimations of uncertainty are needed. Predictive uncertainty,\ni.e. the uncertainty related to the prediction \u02c6y(x\u2217) for a concrete query instance x\u2217, can be de-\ncomposed into two distinct types of uncertainty: aleatoric uncertainty and epistemic uncertainty.\nGenerally, the former refers to the notion of randomness, that is, the variability in the out-\ncome of an experiment which is due to inherently random effects. In the context of image\nclassification, aleatoric uncertainty relates to the inherent difficulty of classifying an image (e.g.\nif the image is blurred or occluded). Epistemic uncertainty refers to uncertainty caused by a\nlack of knowledge, i.e., to the epistemic state of the agent (e.g. a deep learning model). As op-\nposed to aleatoric uncertainty, epistemic uncertainty can in principle be reduced on the basis of\nadditional information. This should precisely be the goal of uncertainty-based active learning\ntechniques [31]. In other words, the goal is to find images that the model should be able to\nclassify, but can\u2019t.\nA classic deep learning model typically provides an estimate of aleatoric uncertainty (which,\nas stated above, can be unreliable). Individually, such a model can\u2019t be used to estimate epis-\ntemic uncertainty. Instead, it can be done if multiple models are available, by looking at model\ndisagreement. Groups of models are called ensembles, and they are commonly obtained by\nrunning multiple trainings: each training is a random process that results in (slightly) different\nmodel parameters. 
Intuitively, if two models disagree on a sample, at least one of them must\nbe wrong, while the other might be right, suggesting that it is possible to classify the sample.\nAnother way to look at it is that models are more likely to disagree on samples that are out-\nof-distribution, because in these \"regions\" the decision boundary and the feature space itself\nare more likely to evolve differently at training.", "mimetype": "text/plain", "start_char_idx": 35804, "end_char_idx": 40174, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "c0f8c088-1ff8-4839-a325-2522f014510a": {"__data__": {"id_": "c0f8c088-1ff8-4839-a325-2522f014510a", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "5e20f7b6-ac06-4a68-913b-419a2b585f5d", "node_type": "1", "metadata": {}, "hash": "868a157fa8067659291506e48c4406d0828679e3737a948fe8f8ed1374646230", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "e29d9346-c846-4203-8c35-b8bb1e8fa481", "node_type": "1", "metadata": {}, "hash": "327e666629b2b2d724bf1be4145cc85a20fb03ba7b90e1efad4ef19d14b50da2", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "A classic deep learning model typically provides an estimate of aleatoric uncertainty (which,\nas stated above, can be unreliable). Individually, such a model can\u2019t be used to estimate epis-\ntemic uncertainty. Instead, it can be done if multiple models are available, by looking at model\ndisagreement. Groups of models are called ensembles, and they are commonly obtained by\nrunning multiple trainings: each training is a random process that results in (slightly) different\nmodel parameters. Intuitively, if two models disagree on a sample, at least one of them must\nbe wrong, while the other might be right, suggesting that it is possible to classify the sample.\nAnother way to look at it is that models are more likely to disagree on samples that are out-\nof-distribution, because in these \"regions\" the decision boundary and the feature space itself\nare more likely to evolve differently at training. A common estimator for epistemic uncertainty\nin the context of ensembles is the Jensen-Shannon Divergence, most frequently called Mutual\nInformation. Given the vectors of predicted probabilities p(e) over the set of classes K for each\nmodel e in the ensemble E, the average probability vector p is computed. The entropy of the\naverage probability vector can be used as a measure of predictive uncertainty:\nH(p) = \u2212p\u22a4 log(p) (3.1)\nIt can be shown that the entropy is upper bounded by the logarithm of the cardinality of the random\nvariable, i.e. H(p) \u2264 log(|K|). The mutual information is defined as:\nJ(p) = H(p) \u2212 (1/|E|) \u2211e\u2208E H(p(e)) (3.2)\nSince entropy is always non-negative, the maximum possible value for J(p) is H(p). However, when\nthe models make similar predictions, (1/|E|) \u2211e\u2208E H(p(e)) \u2192 H(p), thus J(p) \u2192 0, which is its minimum value. 
This shows that mutual information is higher for samples with high disagreement.\nAn alternative way to look at the formula is that from the predictive uncertainty, we subtract\naway the expected aleatoric uncertainty, leaving a measure of the epistemic uncertainty.\nIn the context of active learning, entropy and mutual information are popular scoring functions,\nalso called acquisition functions. These functions are used to evaluate the unlabeled pool of\nsamples. The higher the score, the higher the expected value of the sample. Other common\nscoring methods are least confidence, margin sampling and variation ratios. Least confidence\nselects the samples for which the probability of the top-1 predicted class is smallest. Margin sampling\ncomputes the difference between the probabilities of the top-1 and the top-2 predicted\nclasses. These two scoring functions are conceptually similar to the entropy. In contrast, similar\nto mutual information, variation ratios also looks for disagreement between the models, and it is\ndefined as the fraction of members in the ensemble that do not agree with the majority vote. It is\nworth noting that this function has the undesirable property of only returning a finite number\nof values, related to the number of models in the ensemble |E| and the number of classes |K|.\nAll the implemented scoring functions are summarized in Table 3.1.\nTable 3.1: Summary of typical scoring functions in uncertainty-based active learning\nLeast confidence LC(p) = 1 \u2212 top1k\u2208K(pk)\nMargin sampling MS(p) = 1 \u2212 (top1k\u2208K(pk) \u2212 top2k\u2208K(pk))\nEntropy H(p) = \u2212p\u22a4 log(p)\nMutual information J(p) = H(p) \u2212 (1/|E|) \u2211e\u2208E H(p(e))\nVariation ratios V(p) = 1 \u2212 (1/|E|) \u2211e\u2208E 1[arg maxk\u2208K p(e)k = M],\nwhere M = modee\u2208E(arg maxk\u2208K p(e)k)\n3.2. ENSEMBLE CONFIGURATIONS\nAs mentioned above, ensembles are commonly obtained by running multiple trainings (with\ndifferent random seeds), usually in the 5-10 range [32] (although theoretically, the larger the\nensemble, the better the uncertainty estimation). As the additional computational cost can be a\nlimitation, many techniques have been proposed to derive ensembles without additional train-\ning runs. A well-known example is Monte Carlo dropout [33], where dropout is applied at\ntest time and multiple inferences are run for each sample. 
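As a concrete companion to Table 3.1, the scores can be computed from the ensemble's softmax outputs in a few lines of NumPy. This is a minimal sketch, assuming an array of shape (number of models, number of classes) for a single sample; it is not taken from the project code.

import numpy as np

def uncertainty_scores(probs: np.ndarray, eps: float = 1e-12) -> dict:
    # probs: softmax outputs of the ensemble for one sample,
    # shape (n_models, n_classes); a single model is the case n_models = 1.
    def entropy(p):
        return float(-np.sum(p * np.log(p + eps)))

    p_mean = probs.mean(axis=0)              # average probability vector p
    top2 = np.sort(p_mean)[-2:]              # two largest mean probabilities
    votes = probs.argmax(axis=1)             # per-model predicted class
    majority = np.bincount(votes).argmax()   # majority vote M
    return {
        "least_confidence": 1.0 - p_mean.max(),
        "margin_sampling": 1.0 - (top2[1] - top2[0]),
        "entropy": entropy(p_mean),
        "mutual_information": entropy(p_mean) - np.mean([entropy(p) for p in probs]),
        "variation_ratios": 1.0 - np.mean(votes == majority),
    }

For a single model, mutual information and variation ratios are identically zero, consistent with the fact that a lone model carries no disagreement signal.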
However, it has been shown that\n22\n\nMC-dropout suffers from mode collapse, i.e.", "mimetype": "text/plain", "start_char_idx": 39272, "end_char_idx": 43366, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "e29d9346-c846-4203-8c35-b8bb1e8fa481": {"__data__": {"id_": "e29d9346-c846-4203-8c35-b8bb1e8fa481", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "c0f8c088-1ff8-4839-a325-2522f014510a", "node_type": "1", "metadata": {}, "hash": "648ee93f9c6cacd2f660539d45a3aff80bf95f0905619546b90aae7a155866fa", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "004767a1-85b2-466e-b117-b264a4667205", "node_type": "1", "metadata": {}, "hash": "2b3012f90b8a4edc474732a1f36a739f01eafebfbfb08200c5031e86bb15b44e", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "E NSEMBLE CONFIGURATIONS\nAs mentioned above, ensembles are commonly obtained by running multiple trainings (with\ndifferent random seeds), usually in the 5-10 range [32] (although theoretically, the larger the\nensemble, the better the uncertainty estimation). As the additional computational cost can be a\nlimitation, many techniques have been proposed to derive ensembles without additional train-\ning runs. A well known example is Monte-Carlo drop-out [33], where drop-out is applied at\ntest-time and multiple inferences are run for each sample. However, it has been shown that\n22\n\nMC-dropout suffers from mode collapse, i.e. it can lead to a very imbalanced dataset by fa-\nvoring a specific class during the active learning process [34, 35]. In contrast, in the reported\nstudies, ensembles are able to counteract the imbalance. An interesting approach to derive en-\nsembles with no extra cost is given in [36]. In this work, the disagreement between checkpoints\nstored during successive training epochs (due to the catastrophic forgetting property of DNNs)\nis exploited to efficiently construct large and diverse ensembles. While the high computational\ncost associated to typical ensembles doesn\u2019t seem to be an issue at MILA, the encouraging\nresults obtained in [36] motivate exploring different configurations. The configurations are\nsummarized in Table 3.2.\nTable 3.2: Summary of the ensemble configurations considered in this study\nName Description\nsingle A single model is used.\n5best An ensemble of five models, each from a different training run. The best\ncheckpoint is used for each run.\n20last An ensemble of twenty models, obtained at the 20 last epochs of a training run.\n5ckpt An ensemble of five models, obtained from epochs N, N \u2212 5, . . .N \u2212 20 of a single\ntraining run, where N is the last epoch. This is a subset of 20last.\n3.3. M ETHODS\nThe standard procedure to evaluate active learning techniques follows the typical AL work-\nflow, as described in Algorithm 1, with the difference that labels are usually already available\nfor all the data used in the experiment. Initially, the whole dataset is split into three: an initial\ntraining dataset L, a large pool of data U, and a left-out dataset T for testing. 
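The configurations of Table 3.2 amount to different lists of checkpoints taken from the training runs. The sketch below shows one plausible way to express them and to average the corresponding softmax outputs; the directory layout, file names, the default value of N and the load_resnet50 helper are all hypothetical.

# Sketch only: assumed checkpoint layout, not the project's actual files.
import torch

def configurations(runs_dir="runs", N=40):
    # N is the index of the last training epoch (assumed value).
    return {
        "single": [f"{runs_dir}/seed0/best.pt"],
        "5best":  [f"{runs_dir}/seed{s}/best.pt" for s in range(5)],
        "20last": [f"{runs_dir}/seed0/epoch{e}.pt" for e in range(N - 19, N + 1)],
        "5ckpt":  [f"{runs_dir}/seed0/epoch{e}.pt" for e in range(N - 20, N + 1, 5)],
    }

@torch.no_grad()
def ensemble_probs(checkpoints, images, load_resnet50):
    # Stack per-model softmax outputs: shape (n_models, batch, n_classes).
    out = []
    for path in checkpoints:
        model = load_resnet50(path).eval()
        out.append(torch.softmax(model(images), dim=1))
    return torch.stack(out)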
The model M\n(or the ensemble E) is trained on L, evaluated on T, and used to score the pool of data U with\nthe desired function. A number of samples is selected from U accordingly, and added to L. M\n(or E) is retrained on the updated L, evaluated on T, and so on. The performance measured\nat each iteration is usually plotted as a function of the number of training samples. Hence, the\nresult of the experiment is a plot with several curves, each corresponding to one acquisition\nfunction. Random selection is used as a baseline. With effective active learning techniques, the\nmodel is able to closely approach or even reach the performance of a model trained on the full\ndataset, with only a fraction of the data. The size of the initial training dataset and the number\nof samples added at each iteration are the main hyperparameters of the experiment.\nWhile the initial intended use for active learning was to select images from the moth traps, it\nbecame evident that ongoing efforts to label the images would not deliver enough data within\nthe time at my disposal. However, it was still deemed valuable to develop the active learn-\ning framework and to test it on GBIF images. In fact, as the standard evaluation procedure\nsuggests, besides augmenting training datasets, active learning can also be used to\neffectively reduce training datasets: this operation is called \"training data subset search\". In\n[36], an interesting trick is used to achieve, with only half of the data, better performance than\nthe model trained on the full, highly imbalanced dataset: at each active learning it-\neration, scores are also computed for samples that are already in the training dataset; these\nmay be selected again. At the last iteration, the vast majority of the selected subset consists of\nsamples selected multiple times. The number of unique images is less than half of those in the full\ntraining dataset, yet the model trained on the selected subset outperforms the model trained on\nthe full dataset. It is noted that the effectiveness of the approach is due to its ability to counter\nthe class imbalance. While this experiment was performed in the context of object detection,\nthis approach seems very relevant for our classification problem, as we also face severe class\nimbalance. 
The exploration of this method is left as a future research direction.", "mimetype": "text/plain", "start_char_idx": 42740, "end_char_idx": 47345, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "004767a1-85b2-466e-b117-b264a4667205": {"__data__": {"id_": "004767a1-85b2-466e-b117-b264a4667205", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "e29d9346-c846-4203-8c35-b8bb1e8fa481", "node_type": "1", "metadata": {}, "hash": "327e666629b2b2d724bf1be4145cc85a20fb03ba7b90e1efad4ef19d14b50da2", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "bd3c9a8c-f95b-478e-a4a2-01ed7f774d64", "node_type": "1", "metadata": {}, "hash": "85e16b8a0e8695f02bde804b18582770dafff05984046e6af78a6d3018faffc9", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "In\n[36], using only half of the data, an interesting trick is used to achieve better performance than\n23\n\nthe model trained on the full dataset, which is highly-imbalanced: at each active learning it-\neration, scores are also computed for samples that are already in the training dataset; these\ncan eventually be selected again. At the last iteration, the vast majority of the selected subset is\nsamples with multiple copies. The number of unique images is less than half of those in the full\ntraining dataset, yet the model trained on the selected subset outperforms the model trained on\nthe full dataset. It is noted that the effectiveness of the approach is due to its ability to counter\nthe class imbalance. While this experiment was performed in the context of object detection,\nthis approach seems very relevant for our classification problem, as we also face severe class\nimbalance. The exploration of this method is left as a future research direction.\nThe dataset used for the experiment consists of around 600k GBIF images gathered for the\nmoth species classifier deployed in Quebec and Vermont (the two neighboring states share\nthe same list of 3150 relevant species). 20% is allocated for the initial training dataset L, 20%\nis left-out for testing, and the remaining 60% constitutes the pool of data U from which to\nselect new samples. Due to the time constraints, only the first iteration of a typical active\nlearning evaluation is performed. The ensemble configurations listed in Table 3.2 are created by\nlaunching five trainings onL with different random seeds, and saving the relevant checkpoints.\nThe model is a standard ResNet-50, and the training recipe (learning rate schedule, weight\ndecay, batch size, data augmentations...) is the same as that of the deployed model, only the\nnumber of epochs being adjusted given the reduction in the training dataset size. A python\nscript was developed to score GBIF images with any given scoring function listed in Table 3.1,\ngiven an arbitrary list of ResNet-50 models. With that, the score of every image inU is obtained\nusing each ensemble configuration and scoring function.\n3.4. R ESULTS\n3.4.1. Scores distributions\nThe analysis of the scores gives several insights, and it is also an opportunity to perform sanity\nchecks. First, we can look at scores distributions. 
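The scoring script mentioned above is not reproduced here; the sketch below only illustrates the overall shape of such a step, assuming a PyTorch DataLoader over the pool U, a list of already loaded ResNet-50 models, and a scalar scoring function that maps an array of ensemble probabilities for one image (shape n_models x n_classes) to a single value, like those of Table 3.1. All names are placeholders.

import numpy as np
import torch

@torch.no_grad()
def score_pool(models, pool_loader, score_fn, device="cuda"):
    # models are assumed to be in eval mode and already on `device`.
    all_scores = []
    for images, _ in pool_loader:                    # labels are unused here
        images = images.to(device)
        probs = torch.stack(
            [torch.softmax(m(images), dim=1) for m in models]
        ).cpu().numpy()                              # (n_models, batch, n_classes)
        all_scores.extend(score_fn(probs[:, i]) for i in range(probs.shape[1]))
    return np.array(all_scores)

The resulting array can then be ranked with np.argsort to pick the top-K images of the pool for a given configuration and scoring function.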
The mutual information distributions from\n20last and 5best are given in Fig.3.3a. As expected, both are long-tailed distributions. This is\nmore pronounced for 5best, suggesting that this ensemble is more diverse. The distribution for\n5ckpt, not shown for clarity, is slightly closer to zero than 20last. Given that mutual informa-\ntion\u2019s maximal value corresponds to the entropy\u2019s maximal value, which is log2(3150) = 11.6,\none can wonder if even the 5best ensemble is diverse enough. Perhaps a different split between\ntraining and validation datasets across the five training runs would have been beneficial.\nThe entropy distributions from single and 5best are displayed in Fig.3.3b. The two distributions\nare very similar, except that the distribution from single has a 40% higher peak near zero, mean-\ning that the single model has given many more predictions with very high confidence. Unless\nthese predictions are correct, this seems to confirm the idea that a single model\u2019s estimation of\nuncertainty is unreliable.\nThe margin sampling distribution from 20last is presented in Fig.3.3c, as a good representation\nof the distributions from all other ensemble configurations. Surprisingly, there are two peaks:\none at the minimum and one at the maximum. This suggests that for many images, it is hard to\ndistinguish between two classes. Finally, the variation ratios distribution is shown in Fig.3.3d,\nfor illustrative purposes. The least confidence distribution (which is also long-tailed, although\nless so than the entropy\u2019s) is omitted for brevity.\n24\n\nFigure 3.3: Distribution of different scores with various ensemble configurations.\n3.4.2. Correlation between scores\nAnother point of interest is the correlation between scores. To some extent, this can be dis-\nplayed in 2D histograms. In Fig.3.4, joint distributions of different scores from the 5best en-\nsemble configuration are displayed. 
While the vast majority of samples is concentrated at low\nscores, for the higher scores least confidence and entropy appear significantly more correlated\nthan entropy and mutual information; this is unsurprising, given that least confidence and en-\ntropy are conceptually similar, while entropy and mutual information are fundamentally differ-\nent.", "mimetype": "text/plain", "start_char_idx": 46385, "end_char_idx": 50982, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "bd3c9a8c-f95b-478e-a4a2-01ed7f774d64": {"__data__": {"id_": "bd3c9a8c-f95b-478e-a4a2-01ed7f774d64", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "004767a1-85b2-466e-b117-b264a4667205", "node_type": "1", "metadata": {}, "hash": "2b3012f90b8a4edc474732a1f36a739f01eafebfbfb08200c5031e86bb15b44e", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "5084cc68-c3d7-4136-83c1-b8c48955719a", "node_type": "1", "metadata": {}, "hash": "d9840d158c2be51dc769a2c15c70a2a36e78b43ca9f8a486a89745767e4b879e", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "The least confidence distribution (which is also long-tailed, although\nless so than the entropy\u2019s) is omitted for brevity.\n24\n\n(a)\n (b)\n(c)\n (d)\nFigure 3.3: Distribution of different scores with various ensemble configurations.\n3.4.2. Correlation between scores\nAnother point of interest is the correlation between scores. To some extent, this can be dis-\nplayed in 2D histograms. In Fig.3.4, joint distributions of different scores from the 5best en-\nsemble configuration are displayed. While the vast majority of samples is concentrated at low\nscores, for the higher scores least confidence and entropy appear significantly more correlated\nthan entropy and mutual information; this is unsurprising, given that least confidence and en-\ntropy are conceptually similar, while entropy and mutual information are fundamentally differ-\nent. The other two ensemble configurations, 20last and 5ckpt, give very similar plots. In Fig.3.5,\njoint distributions of the same score from different ensemble configurations are presented. As\nexpected, 20last and 5ckpt present high correlations, both for entropy and mutual information.\nIn contrast, 20last and 5best present lower correlation, especially for mutual information, sug-\ngesting that 20last might not be as good as5best (which is the standard ensemble configuration).\nFinally, for each scoring function and ensemble configuration, the images in U are ranked,\nand the top-K are selected to be added to L. K is chosen such that K = | U|/9 = | L|/3. For\nsimplicity, only the entropy and mutual information scoring functions are discussed for the\nrest of the experiment. As a sanity check, the overlap between the newly selected datasets is\npresented in Table3.3 in terms of Intersection over Union (IoU).\n25\n\n(a) \u03c1 = 0.99\n (b) \u03c1 = 0.94\nFigure 3.4: 2D histograms displaying the correlation between different scores, from the same\nensemble configuration. 
Spearman\u2019s rank correlation coefficient \u03c1 is given.\n(a) \u03c1 = 0.92\n (b) \u03c1 = 0.99\n(c) \u03c1 = 0.87\n (d) \u03c1 = 0.96\nFigure 3.5: 2D histograms displaying the correlation between the same score, from different\nensemble configurations. Spearman\u2019s rank correlation coefficient \u03c1 is given.\n26\n\nTable 3.3: Intersection over Union (IoU) between sets selected with different ensemble config-\nurations and three acquisition functions: entropy H, mutual information J, and random\n5best 20last 5ckpt single /\nH J H J H J H random\n5best H 1 0.37 0.56 0.29 0.53 0.27 0.49 0.06\nJ 1 0.30 0.38 0.39 0.34 0.26 0.06\n20last H 1 0.32 0.83 0.29 0.66 0.06\nJ 1 0.29 0.57 0.26 0.06\n5ckpt H 1 0.27 0.7 0.06\nJ 1 0.24 0.06\nsingle H 1 0.06\n/ random 1\n3.4.3. Visualization of ranked images\nThe visualization of top and bottom images in the score rankings is another insightful point.\nIn Fig.3.6, the bottom five images in U according to the entropy and mutual information rank-\nings for the 5best ensemble are displayed. Three of these images are shared between subsets.\nInterestingly, the model seems to easily classify images where the same species of moths is\npresented multiple times. In Fig. 3.8, the top five images in the entropy ranking are shown.\nThese images raise an alarm, as they have nothing to do in a training set for a classifier to be\nused on trap images such as those in Fig.3.2. In fact, they either don\u2019t display a live moth at\nall, or the moth occupies a minimal portion of the image, on top of being quite blended in the\nbackground. The previous are impossible to classify, in other words, they have a high aleatoric\nuncertainty. Hence, it makes sense that the entropy score is high (and, by the way, the least con-\nfidence and margin sampling scores as well), while the mutual information score is relatively\nlow. Finally, the top five images in the mutual information ranking are presented in Fig.3.7.", "mimetype": "text/plain", "start_char_idx": 50146, "end_char_idx": 53930, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "5084cc68-c3d7-4136-83c1-b8c48955719a": {"__data__": {"id_": "5084cc68-c3d7-4136-83c1-b8c48955719a", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "bd3c9a8c-f95b-478e-a4a2-01ed7f774d64", "node_type": "1", "metadata": {}, "hash": "85e16b8a0e8695f02bde804b18582770dafff05984046e6af78a6d3018faffc9", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "a1381f9d-d625-43ab-8869-28f2ec055ddd", "node_type": "1", "metadata": {}, "hash": "ee414daa3c426bcf6f3196581f608e236016520542e82688ceebf063fd45feef", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "In Fig. 3.8, the top five images in the entropy ranking are shown.\nThese images raise an alarm, as they have nothing to do in a training set for a classifier to be\nused on trap images such as those in Fig.3.2. In fact, they either don\u2019t display a live moth at\nall, or the moth occupies a minimal portion of the image, on top of being quite blended in the\nbackground. The previous are impossible to classify, in other words, they have a high aleatoric\nuncertainty. 
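The two sanity checks above, the rank correlations of Fig.3.4 and Fig.3.5 and the IoU values of Table 3.3, boil down to a few lines. The sketch below assumes 1-D score arrays over the pool and index sets of the selected images; the variable and helper names are hypothetical.

import numpy as np
from scipy.stats import spearmanr

def rank_correlation(scores_a, scores_b):
    # Spearman's rho between two scorings of the same pool (cf. Fig.3.4, 3.5).
    rho, _ = spearmanr(scores_a, scores_b)
    return rho

def top_k(scores, k):
    # Indices of the k highest-scoring pool images.
    return np.argsort(scores)[::-1][:k]

def selection_iou(selected_a, selected_b):
    # Intersection over Union between two selected subsets (cf. Table 3.3).
    a, b = set(selected_a), set(selected_b)
    return len(a & b) / len(a | b)

For instance, selection_iou(top_k(entropy_scores, k), top_k(mi_scores, k)) would correspond to one cell of Table 3.3 for a given ensemble configuration.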
Hence, it makes sense that the entropy score is high (as are the least con-\nfidence and margin sampling scores), while the mutual information score is relatively\nlow. Finally, the top five images in the mutual information ranking are presented in Fig.3.7.\nHere, with one exception (which has the highest entropy), the moths are clearly distinguishable\nfrom the background, but again they only occupy a very small portion of the image, which is\nchallenging for the models. Again, these images don\u2019t seem helpful at all for a model to be\nused on the images in Fig.3.2.\nIn light of these findings, when applied to the GBIF dataset, it seems more likely that the uncertainty-\nbased active learning techniques will serve a different purpose than expected: to filter out\nimages that are too difficult and too different from the trap images. Tests to see if the selected im-\nages would indeed hinder the model\u2019s learning \u2014by completing the first iteration of the active\nlearning evaluation\u2014 could not be completed in time, and are left for future research.\n27\n\nFigure 3.6: Bottom five images in the entropy (H) and mutual information (J) rankings, using\nthe 5best ensemble. Some images appear in both bottom fives.\nFigure 3.7: Top five images in the mutual information ranking with 5best. The entropy score for\neach image is given.\n28\n\nFigure 3.8: Top five images in the entropy ranking with 5best. The mutual information score of\neach image is given.\n29\n\nChapter 4\nConclusion\nThe main contributions from the work presented in this report are twofold. First, the object\ndetection module has been significantly improved, both in terms of speed and accuracy. Ad-\nditionally, good foundations have been laid to facilitate further development of the project.\nSecond, uncertainty-based active learning techniques for the moth species classifier have been\nexplored. While this has been insightful, further research is needed to integrate active learn-\ning into the pipeline. In conclusion, it is worth mentioning the two main factors that have\nnegatively affected progress. The first was the lack of testing data. While tricks can some-\ntimes be applied to efficiently create training data \u2014as was done in this project\u2014, there is no\ngetting around the need for manual labor to build testing datasets. In order to reach a signif-\nicant scale, a coordinated effort is needed. Fortunately, such an effort is currently under way\nfrom collaborators both at Mila and outside, and testing datasets will become available in the\nfollowing months. The second factor has been the lack of effective collaboration inside the\nteam. Considering that the goal \u2014to develop a product\u2014 is akin to that of a start-up, and\nas collaboration is naturally more challenging when colleagues work remotely from different\nregions of the world, I believe that the team would greatly benefit from the adoption of project\ndevelopment best practices that are standard in the industry.\n30\n\nBibliography\n[1] Richard G. Roberts, Timothy F. Flannery, Linda K. Ayliffe, Hiroyuki Yoshida, Jon M. Olley,\nGavin J. Prideaux, Geoff M. Laslett, Alexander Baynes, M. A. Smith, Rhys Jones, and\nBarton L. Smith. New ages for the last Australian megafauna: Continent-wide extinction\nabout 46,000 years ago. Science, 292(5523):1888\u20131892, June 2001.\n[2] Anthony D. Barnosky, Paul L. Koch, Robert S. 
Feranec, Scott L. Wing, and Alan B. Sha-\nbel. Assessing the causes of late pleistocene extinctions on the continents. Science,\n306(5693):70\u201375, October 2004.", "mimetype": "text/plain", "start_char_idx": 53193, "end_char_idx": 57256, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "a1381f9d-d625-43ab-8869-28f2ec055ddd": {"__data__": {"id_": "a1381f9d-d625-43ab-8869-28f2ec055ddd", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "5084cc68-c3d7-4136-83c1-b8c48955719a", "node_type": "1", "metadata": {}, "hash": "d9840d158c2be51dc769a2c15c70a2a36e78b43ca9f8a486a89745767e4b879e", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "7e4d7d91-e79e-49e5-8c71-8a48fc38cb46", "node_type": "1", "metadata": {}, "hash": "edee2168126ada7c3c4764b1e53036ca689cd0ddc87439c86eee915056ffd0d7", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "30\n\nBibliography\n[1] Richard G. Roberts, Timothy F. Flannery, Linda K. Ayliffe, Hiroyuki Yoshida, Jon M. Olley,\nGavin J. Prideaux, Geoff M. Laslett, Alexander Baynes, M. A. Smith, Rhys Jones, and\nBarton L. Smith. New ages for the last australian megafauna: Continent-wide extinction\nabout 46, 000 years ago. Science, 292(5523):1888\u20131892, June 2001.\n[2] Anthony D. Barnosky, Paul L. Koch, Robert S. Feranec, Scott L. Wing, and Alan B. Sha-\nbel. Assessing the causes of late pleistocene extinctions on the continents. Science,\n306(5693):70\u201375, October 2004.\n[3] Luciano Prates and S. Ivan Perez. Late pleistocene south american megafaunal extinctions\nassociated with rise of fishtail points and human population.Nature Communications, 12(1),\nApril 2021.\n[4] Sander van der Kaars, Gifford H. Miller, Chris S. M. Turney, Ellyn J. Cook, Dirk N\u00fcrnberg,\nJoachim Sch\u00f6nfeld, A. Peter Kershaw, and Scott J. Lehman. Humans rather than climate\nthe primary cause of pleistocene megafaunal extinction in australia. Nature Communica-\ntions, 8(1), January 2017.\n[5] Gerardo Ceballos, Paul R. Ehrlich, and Rodolfo Dirzo. Biological annihilation via the\nongoing sixth mass extinction signaled by vertebrate population losses and declines. Pro-\nceedings of the National Academy of Sciences, 114(30):E6089\u2013E6096, 2017.\n[6] Robert H. Cowie, Philippe Bouchet, and Beno\u00eet Fontaine. The sixth mass extinction: fact,\nfiction or speculation? Biological Reviews, 97(2):640\u2013663, January 2022.\n[7] Anthony D. Barnosky, Nicholas Matzke, Susumu Tomiya, Guinevere O. U. Wogan, Brian\nSwartz, Tiago B. Quental, Charles Marshall, Jenny L. McGuire, Emily L. Lindsey, Kaitlin C.\nMaguire, Ben Mersey, and Elizabeth A. Ferrer. Has the earth\u2019s sixth mass extinction al-\nready arrived? Nature, 471(7336):51\u201357, March 2011.\n[8] Aelys M. Humphreys, Rafa\u00ebl Govaerts, Sarah Z. Ficinski, Eimear Nic Lughadha, and\nMaria S. Vorontsova. Global dataset shows geography and life form predict modern plant\nextinction and rediscovery. Nature Ecology & Evolution, 3(7):1043\u20131047, June 2019.\n[9] Hillary S. Young, Douglas J. McCauley, Mauro Galetti, and Rodolfo Dirzo. Patterns,\ncauses, and consequences of anthropocene defaunation. 
Annual Review of Ecology, Evo-\nlution, and Systematics, 47:333\u2013358, 2016.\n[10] Arthur D. Chapman. Numbers of living species in australia and the world. Technical\nreport, Australian Biodiversity Information Services, 2009.\n[11] Nigel E. Stork. How many species of insects and other terrestrial arthropods are there on\nearth? Annual Review of Entomology, 63(1):31\u201345, January 2018.\n31\n\n[12] Caspar A. Hallmann, Martin Sorg, Eelke Jongejans, Henk Siepel, Nick Hofland, Heinz\nSchwan, Werner Stenmans, Andreas M\u00fcller, Hubert Sumser, Thomas H\u00f6rren, Dave Goul-\nson, and Hans de Kroon. More than 75 percent decline over 27 years in total flying insect\nbiomass in protected areas. PLOS ONE, 12(10):e0185809, October 2017.", "mimetype": "text/plain", "start_char_idx": 56701, "end_char_idx": 59594, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "7e4d7d91-e79e-49e5-8c71-8a48fc38cb46": {"__data__": {"id_": "7e4d7d91-e79e-49e5-8c71-8a48fc38cb46", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "a1381f9d-d625-43ab-8869-28f2ec055ddd", "node_type": "1", "metadata": {}, "hash": "ee414daa3c426bcf6f3196581f608e236016520542e82688ceebf063fd45feef", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "379dd8d5-bf98-4615-93a9-cba410df45ec", "node_type": "1", "metadata": {}, "hash": "3f1b77f5daba08528b49c812712da76e3346a3163aadfc64c9dfc3d3a212a9f6", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "[10] Arthur D. Chapman. Numbers of living species in australia and the world. Technical\nreport, Australian Biodiversity Information Services, 2009.\n[11] Nigel E. Stork. How many species of insects and other terrestrial arthropods are there on\nearth? Annual Review of Entomology, 63(1):31\u201345, January 2018.\n31\n\n[12] Caspar A. Hallmann, Martin Sorg, Eelke Jongejans, Henk Siepel, Nick Hofland, Heinz\nSchwan, Werner Stenmans, Andreas M\u00fcller, Hubert Sumser, Thomas H\u00f6rren, Dave Goul-\nson, and Hans de Kroon. More than 75 percent decline over 27 years in total flying insect\nbiomass in protected areas. PLOS ONE, 12(10):e0185809, October 2017.\n[13] Sebastian Seibold, Martin M. Gossner, Nadja K. Simons, Nico Bl\u00fcthgen, J\u00f6rg M\u00fcller, Di-\ndem Ambarl\u0131, Christian Ammer, J\u00fcrgen Bauhus, Markus Fischer, Jan C. Habel, Karl Ed-\nuard Linsenmair, Thomas Nauss, Caterina Penone, Daniel Prati, Peter Schall, Ernst-Detlef\nSchulze, Juliane Vogt, Stephan W\u00f6llauer, and Wolfgang W. Weisser. Arthropod decline in\ngrasslands and forests is associated with landscape-level drivers. Nature, 574(7780):671\u2013\n674, October 2019.\n[14] Roel van Klink, Diana E. Bowler, Konstantin B. Gongalsky, Ann B. Swengel, Alessandro\nGentile, and Jonathan M. Chase. Meta-analysis reveals declines in terrestrial but increases\nin freshwater insect abundances. Science, 368(6489):417\u2013420, April 2020.\n[15] Graham A. Montgomery, Robert R. Dunn, Richard Fox, Eelke Jongejans, Simon R. Leather,\nManu E. Saunders, Chris R. Shortall, Morgan W. Tingley, and David L. Wagner. Is the\ninsect apocalypse upon us? how to find out. Biological Conservation, 241:108327, 2020.\n[16] G. W. 
Hopkins and R. P . Freckleton. Declines in the numbers of amateur and professional\ntaxonomists: implications for conservation. Animal Conservation, 5(3):245\u2013249, August\n2002.\n[17] Erica Fleishman and Dennis D. Murphy. A realistic assessment of the indicator potential of\nbutterflies and other charismatic taxonomic groups. Conservation Biology, 23(5):1109\u20131116,\nOctober 2009.\n[18] Kim Bjerge, Jakob Bonde Nielsen, Martin Videb\u00e6k Sepstrup, Flemming Helsing-Nielsen,\nand Toke Thomas H\u00f8ye. An automated light trap to monitor moths (lepidoptera) using\ncomputer vision-based tracking and deep learning. Sensors, 21(2), 2021.\n[19] Michael J. Furlong, Denis J. Wright, and Lloyd M. Dosdall. Diamondback moth ecol-\nogy and management: Problems, progress, and prospects. Annual Review of Entomology,\n58(1):517\u2013541, January 2013.\n[20] Maartje J. Klapwijk, Gy\u00f6rgy Cs\u00f3ka, Anik\u00f3 Hirka, and Christer Bj\u00f6rkman. Forest in-\nsects and climate change: long-term trends in herbivore damage. Ecology and Evolution,\n3(12):4183\u20134196, September 2013.\n[21] Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. Deep residual learning for\nimage recognition, 2015.\n[22] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time\nobject detection with region proposal networks, 2015.\n[23] Tsung-Yi Lin, Piotr Doll\u00e1r, Ross Girshick, Kaiming He, Bharath Hariharan, and Serge Be-\nlongie. Feature pyramid networks for object detection, 2017.\n[24] Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura\nGustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Doll\u00e1r,\nand Ross Girshick.", "mimetype": "text/plain", "start_char_idx": 58956, "end_char_idx": 62211, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "379dd8d5-bf98-4615-93a9-cba410df45ec": {"__data__": {"id_": "379dd8d5-bf98-4615-93a9-cba410df45ec", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "5cdab5af-82e9-4b3d-9db0-1792e6438fd4", "node_type": "4", "metadata": {}, "hash": "789f423aaa59148ae7466c9995a68374a1536300f0e2182e45f3794cf17cc305", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "7e4d7d91-e79e-49e5-8c71-8a48fc38cb46", "node_type": "1", "metadata": {}, "hash": "edee2168126ada7c3c4764b1e53036ca689cd0ddc87439c86eee915056ffd0d7", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "[21] Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. Deep residual learning for\nimage recognition, 2015.\n[22] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time\nobject detection with region proposal networks, 2015.\n[23] Tsung-Yi Lin, Piotr Doll\u00e1r, Ross Girshick, Kaiming He, Bharath Hariharan, and Serge Be-\nlongie. Feature pyramid networks for object detection, 2017.\n[24] Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura\nGustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Doll\u00e1r,\nand Ross Girshick. Segment anything, 2023.\n32\n\n[25] Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang\nFu, and Alexander C. Berg. SSD: Single shot MultiBox detector. InComputer Vision \u2013 ECCV\n2016, pages 21\u201337. 
Springer International Publishing, 2016.\n[26] Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, and Liang-Chieh\nChen. Mobilenetv2: Inverted residuals and linear bottlenecks, 2019.\n[27] Andrew Howard, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan,\nWeijun Wang, Yukun Zhu, Ruoming Pang, Vijay Vasudevan, Quoc V . Le, and Hartwig\nAdam. Searching for mobilenetv3, 2019.\n[28] Shifeng Zhang, Cheng Chi, Yongqiang Yao, Zhen Lei, and Stan Z. Li. Bridging the gap\nbetween anchor-based and anchor-free detection via adaptive training sample selection,\n2020.\n[29] Ozan Sener and Silvio Savarese. Active learning for convolutional neural networks: A\ncore-set approach, 2017.\n[30] Mingfei Wu, Chen Li, and Zehuan Yao. Deep active learning for computer vision tasks:\nMethodologies, applications, and challenges. Applied Sciences, 12(16), 2022.\n[31] Vu-Linh Nguyen, S\u00e9bastien Destercke, and Eyke H\u00fcllermeier. Epistemic uncertainty sam-\npling, 2019.\n[32] Balaji Lakshminarayanan, Alexander Pritzel, and Charles Blundell. Simple and scalable\npredictive uncertainty estimation using deep ensembles, 2017.\n[33] Yarin Gal, Riashat Islam, and Zoubin Ghahramani. Deep bayesian active learning with\nimage data, 2017.\n[34] William H. Beluch, Tim Genewein, Andreas Nurnberger, and Jan M. Kohler. The power\nof ensembles for active learning in image classification. In 2018 IEEE/CVF Conference on\nComputer Vision and Pattern Recognition, pages 9368\u20139377, 2018.\n[35] Remus Pop and Patric Fulop. Deep ensemble bayesian active learning : Addressing the\nmode collapse issue in monte carlo dropout via ensembles, 2018.\n[36] Kashyap Chitta, Jose M. Alvarez, Elmar Haussmann, and Clement Farabet. Training data\nsubset search with ensemble active learning, 2020.\n33", "mimetype": "text/plain", "start_char_idx": 61611, "end_char_idx": 64197, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}}, "docstore/ref_doc_info": {"5cdab5af-82e9-4b3d-9db0-1792e6438fd4": {"node_ids": ["a31b64b9-d4c7-4ec6-8ca0-10d085d48205", "b17212a2-233f-492c-b53b-6a03c84d9f4f", "21f04a43-0875-4b86-94e4-93b11843969f", "35cb1ff3-4a4f-4616-a7c1-af26f0b9d1e9", "db73e86c-4c24-4992-b2c4-96c3cb23e27e", "66023780-9b49-46a9-9204-8a4786f537e9", "3ed814ba-a0a4-4328-802d-91780bc5964e", "2ce891bb-b95c-457c-859e-7a8980eb98c2", "04a0e9e2-4399-4f8e-9bb7-4a92ca7f3dfa", "39a9562c-9040-4276-b6a3-0686a44bcf50", "fe41da69-d1ee-4ce6-8888-34e9c9a759b5", "5e20f7b6-ac06-4a68-913b-419a2b585f5d", "c0f8c088-1ff8-4839-a325-2522f014510a", "e29d9346-c846-4203-8c35-b8bb1e8fa481", "004767a1-85b2-466e-b117-b264a4667205", "bd3c9a8c-f95b-478e-a4a2-01ed7f774d64", "5084cc68-c3d7-4136-83c1-b8c48955719a", "a1381f9d-d625-43ab-8869-28f2ec055ddd", "7e4d7d91-e79e-49e5-8c71-8a48fc38cb46", "379dd8d5-bf98-4615-93a9-cba410df45ec"], "metadata": {}}}}