Wabgaonkar, H. M., Sengupta, S., & Saha, T. (2022). Virtual Agent with Dialogue Management System and Method of Training a Dialogue Management System. U.S. Patent.
@misc{harshawardhan,
  title        = {Virtual Agent with Dialogue Management System and Method of Training a Dialogue Management System},
  author       = {Wabgaonkar, Harshawardhan Madhukar and Sengupta, Shubhashis and Saha, Tulika},
  howpublished = {U.S. Patent},
  year         = {2022},
  url          = {https://patentscope.wipo.int/search/en/detail.jsf?docId=US279624169&docAn=16008337}
}
A virtual agent with a dialogue management system and a method of training the dialogue management system is disclosed. The dialogue management system is trained using a deep reinforcement learning process. Training involves obtaining or simulating training dialogue data. During the training process, actions for the dialogue management system are selected using a Deep Q Network to process observations. The Deep Q Network is updated using a target function that includes a reward. The reward may be generated by considering one or more of the following metrics: task completion percentage, dialogue length, sentiment analysis of the user’s response, emotional analysis of the user’s state, explicit user feedback, and assessed quality of the action. The set of actions that the dialogue management system can take at any time may be limited by an action screener that predicts the subset of actions that the agent should consider for a given state of the system.
Preprints
Tomar, M., Tiwari, A., Saha, T., Jha, P., & Saha, S. (2024). An EcoSage Assistant: Towards Building A Multimodal Plant Care Dialogue Assistant.
@unpublished{tomar2024ecosage,
  title         = {An {EcoSage} Assistant: Towards Building A Multimodal Plant Care Dialogue Assistant},
  author        = {Tomar, Mohit and Tiwari, Abhisek and Saha, Tulika and Jha, Prince and Saha, Sriparna},
  year          = {2024},
  eprint        = {2401.06807},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  note          = {arXiv preprint arXiv:2401.06807}
}
Refereed Conference Proceedings
Jain, R., Saha, T., Lalwani, J., & Saha, S. (2023). Can you Summarize my learnings? Towards Perspective-based Educational Dialogue Summarization. In H. Bouamor, J. Pino, & K. Bali (Eds.), Findings of the Association for Computational Linguistics: EMNLP 2023 (pp. 3158–3173). Association for Computational Linguistics. https://aclanthology.org/2023.findings-emnlp.208
@inproceedings{jain-etal-2023-summarize,
  author    = {Jain, Raghav and Saha, Tulika and Lalwani, Jhagrut and Saha, Sriparna},
  editor    = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
  title     = {Can you Summarize my learnings? Towards Perspective-based Educational Dialogue Summarization},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  pages     = {3158--3173},
  doi       = {10.18653/v1/2023.findings-emnlp.208},
  url       = {https://aclanthology.org/2023.findings-emnlp.208}
}
The steady increase in the utilization of Virtual Tutors (VT) over recent years has allowed for a more efficient, personalized, and interactive AI-based learning experiences. A vital aspect in these educational chatbots is summarizing the conversations between the VT and the students, as it is critical in consolidating learning points and monitoring progress. However, the approach to summarization should be tailored according to the perspective. Summarization from the VTs perspective should emphasize on its teaching efficiency and potential improvements. Conversely, student-oriented summaries should distill learning points, track progress, and suggest scope for improvements. Based on this hypothesis, in this work, we propose a new task of Multi-modal Perspective based Dialogue Summarization (MM-PerSumm), demonstrated in an educational setting. Towards this aim, we introduce a novel dataset, CIMA-Summ that summarizes educational dialogues from three unique perspectives: the Student, the Tutor, and a Generic viewpoint. In addition, we propose an Image and Perspective-guided Dialogue Summarization (IP-Summ) model which is a Seq2Seq language model incorporating (i) multi-modal learning from images and (ii) a perspective-based encoder that constructs a dialogue graph capturing the intentions and actions of both the VT and the student, enabling the summarization of a dialogue from diverse perspectives. Lastly, we conduct detailed analyses of our model’s performance, highlighting the aspects that could lead to optimal modeling of IP-Summ.
Jain, R., Saha, T., & Saha, S. (2023). T-VAKS: A Tutoring-Based Multimodal Dialog System via Knowledge Selection. ECAI 2023, 1132–1139. https://doi.org/10.3233/FAIA230388
@inproceedings{jain2023t,
  title     = {{T-VAKS}: A Tutoring-Based Multimodal Dialog System via Knowledge Selection},
  author    = {Jain, Raghav and Saha, Tulika and Saha, Sriparna},
  booktitle = {ECAI 2023},
  year      = {2023},
  publisher = {IOS Press},
  pages     = {1132--1139},
  doi       = {10.3233/FAIA230388},
  url       = {https://doi.org/10.3233/FAIA230388}
}
Advancements in Conversational Natural Language Processing (NLP) have the potential to address critical social challenges, particularly in achieving the United Nations’ Sustainable Development Goal of quality education. However, the application of NLP in the educational domain, especially language learning, has been limited due to the inherent complexities of the field and the scarcity of available datasets. In this paper, we introduce T-VAKS (Tutoring Virtual Agent with Knowledge Selection), a novel language tutoring multimodal Virtual Agent (VA) designed to assist students in learning a new language, thereby promoting AI for Social Good. T-VAKS aims to bridge the gap between NLP and the educational domain, enabling more effective language tutoring through intelligent virtual agents. Our approach employs an information theory-based knowledge selection module built on top of a multimodal seq2seq generative model, facilitating the generation of appropriate, informative, and contextually relevant tutor responses. The knowledge selection module in turn consists of two sub-modules: (i) knowledge relevance estimation, and (ii) knowledge focusing framework. We evaluate the performance of our proposed end-to-end dialog system against various baseline models and the most recent state-of-the-art models, using multiple evaluation metrics. The results demonstrate that T-VAKS outperforms competing models, highlighting the potential of our approach in enhancing language learning through the use of conversational NLP and virtual agents, ultimately contributing to addressing social challenges and promoting well-being.
Tomar, M., Tiwari, A., Saha, T., & Saha, S. (2023). Your tone speaks louder than your face! Modality Order Infused Multi-modal Sarcasm Detection. Proceedings of the 31st ACM International Conference on Multimedia, 3926–3933. https://doi.org/10.1145/3581783.3612528
@inproceedings{10.1145/3581783.3612528,
  author    = {Tomar, Mohit and Tiwari, Abhisek and Saha, Tulika and Saha, Sriparna},
  title     = {Your tone speaks louder than your face! Modality Order Infused Multi-modal Sarcasm Detection},
  booktitle = {Proceedings of the 31st ACM International Conference on Multimedia},
  series    = {MM '23},
  year      = {2023},
  isbn      = {9798400701085},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Ottawa, ON, Canada},
  pages     = {3926--3933},
  numpages  = {8},
  doi       = {10.1145/3581783.3612528},
  url       = {https://doi.org/10.1145/3581783.3612528},
  keywords  = {sarcasm, multi-party conversations, multi-modality, multi-modal fusion, contextual attention}
}
Figurative language is an essential component of human communication, and detecting sarcasm in text has become a challenging yet highly popular task in natural language processing. As humans, we rely on a combination of visual and auditory cues, such as facial expressions and tone of voice, to comprehend a message. Our brains are implicitly trained to integrate information from multiple senses to form a complete understanding of the message being conveyed, a process known as multi-sensory integration. The combination of different modalities not only provides additional information but also amplifies the information conveyed by each modality in relation to the others. Thus, the infusion order of different modalities also plays a significant role in multimodal processing. In this paper, we investigate the impact of different modality infusion orders for identifying sarcasm in dialogues. We propose a modality order-driven module integrated into a transformer network, MO-Sarcation that fuses modalities in an ordered manner. Our model outperforms several state-of-the-art models by 1-3% across various metrics, demonstrating the crucial role of modality order in sarcasm detection. The obtained improvements and detailed analysis show that audio tone should be infused with textual content, followed by visual information to identify sarcasm efficiently. The code and dataset are available at https://github.com/mohit2b/MO-Sarcation.
Saha, T., Ganguly, D., Saha, S., & Mitra, P. (2023). Workshop On Large Language Models’ Interpretability and Trustworthiness (LLMIT). Proceedings of the 32nd ACM International Conference on Information and Knowledge Management, 5290–5293. https://doi.org/10.1145/3583780.3615311
@inproceedings{10.1145/3583780.3615311,
  author    = {Saha, Tulika and Ganguly, Debasis and Saha, Sriparna and Mitra, Prasenjit},
  title     = {Workshop On Large Language Models' Interpretability and Trustworthiness (LLMIT)},
  booktitle = {Proceedings of the 32nd ACM International Conference on Information and Knowledge Management},
  series    = {CIKM '23},
  year      = {2023},
  isbn      = {9798400701245},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Birmingham, United Kingdom},
  pages     = {5290--5293},
  numpages  = {4},
  doi       = {10.1145/3583780.3615311},
  url       = {https://doi.org/10.1145/3583780.3615311},
  keywords  = {explainability, in-context learning, interpretability, large language model, trustworthiness}
}
Large language models (LLMs), when scaled from millions to billions of parameters, have been demonstrated to exhibit the so-called ’emergence’ effect, in that they are not only able to produce semantically correct and coherent text, but are also able to adapt themselves surprisingly well with small changes in contexts supplied as inputs (commonly called prompts). Despite producing semantically coherent and potentially relevant text for a given context, LLMs are vulnerable to yield incorrect information. This misinformation generation, or the so-called hallucination problem of an LLM, gets worse when an adversary manipulates the prompts to their own advantage, e.g., generating false propaganda to disrupt communal harmony, generating false information to trap consumers with target consumables etc. Not only does the consumption of an LLM-generated hallucinated content by humans pose societal threats, such misinformation, when used as prompts, may lead to detrimental effects for in-context learning (also known as few-shot prompt learning). With reference to the above-mentioned problems of LLM usage, we argue that it is necessary to foster research on topics related to not only identifying misinformation from LLM-generated content, but also to mitigate the propagation effects of this generated misinformation on downstream predictive tasks thus leading to more robust and effective leveraging in-context learning.
Saha, T., Tiwari, A., & Saha, S. (2023). Trends and Overview: The Potential of Conversational Agents in Digital Health. In J. Kamps, L. Goeuriot, F. Crestani, M. Maistro, H. Joho, B. Davis, C. Gurrin, U. Kruschwitz, & A. Caputo (Eds.), Advances in Information Retrieval (pp. 349–356). Springer Nature Switzerland.
@inproceedings{101007978303128241636,
  author    = {Saha, Tulika and Tiwari, Abhisek and Saha, Sriparna},
  editor    = {Kamps, Jaap and Goeuriot, Lorraine and Crestani, Fabio and Maistro, Maria and Joho, Hideo and Davis, Brian and Gurrin, Cathal and Kruschwitz, Udo and Caputo, Annalina},
  title     = {Trends and Overview: The Potential of Conversational Agents in Digital Health},
  booktitle = {Advances in Information Retrieval},
  year      = {2023},
  publisher = {Springer Nature Switzerland},
  address   = {Cham},
  pages     = {349--356},
  isbn      = {978-3-031-28241-6}
}
With the COVID-19 pandemic serving as a trigger, 2020 saw an unparalleled global expansion of tele-health [23]. Tele-health successfully lowers the need for in-person consultations and, thus, the danger of contracting a virus. While the COVID-19 pandemic sped up the adoption of virtual healthcare delivery in numerous nations, it also accelerated the creation of a wide range of other different technology-enabled systems and procedures for providing virtual healthcare to patients. Rightly so, the COVID-19 has brought many difficulties for patients (https://www.who.int/news/item/02-03-2022-covid-19-pandemic-triggers-25-increase-in-prevalence-of-anxiety-and-depression-worldwide) who need continuing care and monitoring for mental health issues and/or other chronic diseases.
Chandra, M., Ganguly, D., Saha, T., & Ounis, I. (2023). ’Choose your Data Wisely’: Active Learning based Selection with Multi-Objective Optimisation for Mitigating Stereotypes. Proceedings of the 32nd ACM International Conference on Information and Knowledge Management, 3768–3772. https://doi.org/10.1145/3583780.3615261
@inproceedings{10.1145/3583780.3615261,
  author    = {Chandra, Manish and Ganguly, Debasis and Saha, Tulika and Ounis, Iadh},
  title     = {`Choose your Data Wisely': Active Learning based Selection with Multi-Objective Optimisation for Mitigating Stereotypes},
  booktitle = {Proceedings of the 32nd ACM International Conference on Information and Knowledge Management},
  series    = {CIKM '23},
  year      = {2023},
  isbn      = {9798400701245},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Birmingham, United Kingdom},
  pages     = {3768--3772},
  numpages  = {5},
  doi       = {10.1145/3583780.3615261},
  url       = {https://doi.org/10.1145/3583780.3615261},
  keywords  = {active learning, fairness, multi-objective learning}
}
Data-driven (deep) learning methods have led to parameterised abstractions of the data, often leading to stereotype societal biases in their predictions, e.g., predicting more frequently that women are weaker than men, or that African Americans are more likely to commit crimes than Caucasians. Standard approaches of mitigating such stereotypical biases from deep neural models include modifying the training dataset (pre-processing), or adjusting the model parameters with a bias-specific objective (in-processing). In our work, we approach this bias mitigation from a different perspective - that of an active learning-based selection of a subset of data instances towards training a model optimised for both effectiveness and fairness. Specifically speaking, the imbalances in the attribute value priors can be alleviated by constructing a balanced subset of the data instances with two selection objectives - first, of improving the model confidence of the primary task itself (a standard practice in active learning), and the second, of taking into account the parity of the model predictions with respect to the sensitive attributes, such as gender and race etc. We demonstrate that our proposed selection function achieves better results in terms of both the primary task effectiveness and fairness. The results are further shown to improve when this active learning-based data selection is combined with an in-process method of multi-objective training.
Saha, T., Patra, A. P., Saha, S., & Bhattacharyya, P. (2022). Meta-Learning based Deferred Optimisation for Sentiment and Emotion aware Multi-modal Dialogue Act Classification. Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), 978–990. https://aclanthology.org/2022.aacl-main.71
@inproceedings{sahaetal2022meta,
  title     = {Meta-Learning based Deferred Optimisation for Sentiment and Emotion aware Multi-modal Dialogue Act Classification},
  author    = {Saha, Tulika and Patra, Aditya Prakash and Saha, Sriparna and Bhattacharyya, Pushpak},
  booktitle = {Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  month     = nov,
  year      = {2022},
  address   = {Online only},
  publisher = {Association for Computational Linguistics},
  pages     = {978--990},
  url       = {https://aclanthology.org/2022.aacl-main.71}
}
Dialogue Act Classification (DAC) that determines the communicative intention of an utterance has been investigated widely over the years as a standalone task. But the emotional state of the speaker has a considerable effect on its pragmatic content. Sentiment as a human behavior is also closely related to emotion and one aids in the better understanding of the other. Thus, their role in identification of DAs needs to be explored. As a first step, we extend the newly released multi-modal EMOTyDA dataset to enclose sentiment tags for each utterance. In order to incorporate these multiple aspects, we propose a Dual Attention Mechanism (DAM) based multi-modal, multi-tasking conversational framework. The DAM module encompasses intra-modal and interactive inter-modal attentions with multiple loss optimization at various hierarchies to fuse multiple modalities efficiently and learn generalized features across all the tasks. Additionally, to counter the class-imbalance issue in dialogues, we introduce a 2-step Deferred Optimisation Schedule (DOS) that involves Meta-Net (MN) learning and deferred re-weighting where the former helps to learn an explicit weighting function from data automatically and the latter deploys a re-weighted multi-task loss with a smaller learning rate. Empirically, we establish that the joint optimisation of multi-modal DAC, SA and ER tasks along with the incorporation of 2-step DOS and MN learning produces better results compared to its different counterparts and outperforms state-of-the-art model.
Xie, Q., Huang, J., Saha, T., & Ananiadou, S. (2022). GRETEL: Graph Contrastive Topic Enhanced Language Model for Long Document Extractive Summarization. Proceedings of the 29th International Conference on Computational Linguistics, 6259–6269. https://aclanthology.org/2022.coling-1.546
@inproceedings{xieetal2022gretel,
  author    = {Xie, Qianqian and Huang, Jimin and Saha, Tulika and Ananiadou, Sophia},
  title     = {{GRETEL}: Graph Contrastive Topic Enhanced Language Model for Long Document Extractive Summarization},
  booktitle = {Proceedings of the 29th International Conference on Computational Linguistics},
  month     = oct,
  year      = {2022},
  address   = {Gyeongju, Republic of Korea},
  publisher = {International Committee on Computational Linguistics},
  pages     = {6259--6269},
  url       = {https://aclanthology.org/2022.coling-1.546}
}
Recently, neural topic models (NTMs) have been incorporated into pre-trained language models (PLMs), to capture the global semantic information for text summarization. However, in these methods, there remain limitations in the way they capture and integrate the global semantic information. In this paper, we propose a novel model, the graph contrastive topic enhanced language model (GRETEL), that incorporates the graph contrastive topic model with the pre-trained language model, to fully leverage both the global and local contextual semantics for long document extractive summarization. To better capture and incorporate the global semantic information into PLMs, the graph contrastive topic model integrates the hierarchical transformer encoder and the graph contrastive learning to fuse the semantic information from the global document context and the gold summary. To this end, GRETEL encourages the model to efficiently extract salient sentences that are topically related to the gold summary, rather than redundant sentences that cover sub-optimal topics. Experimental results on both general domain and biomedical datasets demonstrate that our proposed method outperforms SOTA methods.
Saha, T., Reddy, S., Das, A., Saha, S., & Bhattacharyya, P. (2022). A Shoulder to Cry on: Towards A Motivational Virtual Assistant for Assuaging Mental Agony. Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, 2436–2449. https://aclanthology.org/2022.naacl-main.174
@inproceedings{sahaetal2022shoulder,
  author    = {Saha, Tulika and Reddy, Saichethan and Das, Anindya and Saha, Sriparna and Bhattacharyya, Pushpak},
  title     = {A Shoulder to Cry on: Towards A Motivational Virtual Assistant for Assuaging Mental Agony},
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  month     = jul,
  year      = {2022},
  address   = {Seattle, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {2436--2449},
  doi       = {10.18653/v1/2022.naacl-main.174},
  url       = {https://aclanthology.org/2022.naacl-main.174}
}
Mental Health Disorders continue plaguing humans worldwide. Aggravating this situation is the severe shortage of qualified and competent mental health professionals (MHPs), which underlines the need for developing Virtual Assistants (VAs) that can assist MHPs. The data+ML for automation can come from platforms that allow visiting and posting messages in peer-to-peer anonymous manner for sharing their experiences (frequently stigmatized) and seeking support. In this paper, we propose a VA that can act as the first point of contact and comfort for mental health patients. We curate a dataset, Motivational VA: MotiVAte comprising of 7k dyadic conversations collected from a peer-to-peer support platform. The system employs two mechanisms: (i) Mental Illness Classification: an attention based BERT classifier that outputs the mental disorder category out of the 4 categories, viz., Major Depressive Disorder (MDD), Anxiety, Obsessive Compulsive Disorder (OCD) and Post-traumatic Stress Disorder (PTSD), based on the input ongoing dialog between the support seeker and the VA; and (ii) Mental Illness Conditioned Motivational Dialogue Generation (MI-MDG): a sentiment driven Reinforcement Learning (RL) based motivational response generator. The empirical evaluation demonstrates the system capability by way of outperforming several baselines.
Saha, T., Gakhreja, V., Das, A. S., Chakraborty, S., & Saha, S. (2022). Towards Motivational and Empathetic Response Generation in Online Mental Health Support. Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, 2650–2656. https://doi.org/10.1145/3477495.3531912
@inproceedings{10114534774953531912,
  author    = {Saha, Tulika and Gakhreja, Vaibhav and Das, Anindya Sundar and Chakraborty, Souhitya and Saha, Sriparna},
  title     = {Towards Motivational and Empathetic Response Generation in Online Mental Health Support},
  booktitle = {Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval},
  series    = {SIGIR '22},
  year      = {2022},
  isbn      = {9781450387323},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Madrid, Spain},
  pages     = {2650--2656},
  numpages  = {7},
  doi       = {10.1145/3477495.3531912},
  url       = {https://doi.org/10.1145/3477495.3531912},
  keywords  = {empathy, motivation, mental health support, generation}
}
The scarcity of Mental Health Professionals (MHPs) available to assist patients underlines the need for developing automated systems to help MHPs combat the grievous mental illness called Major Depressive Disorder. In this paper, we develop a Virtual Assistant (VA) that serves as a first point of contact for users who are depressed or disheartened. In support based conversations, two primary components have been identified to produce positive outcomes, empathy and motivation. While empathy necessitates acknowledging the feelings of the users with a desire to help, imparting hope and motivation uplifts the spirit of support seekers in distress. A combination of these aspects will ensure generalized positive outcome and beneficial alliance in mental health support. The VA, thus, should be capable of generating empathetic and motivational responses, continuously demonstrating positive sentiment by the VA. The end-to-end system employs two mechanisms in a pipe-lined manner: (i) Motivational Response Generator (MRG): a sentiment driven Reinforcement Learning (RL) based motivational response generator; and (ii) Empathetic Rewriting Framework (ERF): a transformer based model that rewrites the response from MRG to induce empathy. Experimental results indicate that our proposed VA outperforms several of its counterparts. To the best of our knowledge, this is the first work that seeks to incorporate these aspects together in an end-to-end system.
Saha, T., & Ananiadou, S. (2022). Emotion-aware and Intent-controlled Empathetic Response Generation using Hierarchical Transformer Network. 2022 International Joint Conference on Neural Networks (IJCNN), 1–8.
@inproceedings{9892592,
  author    = {Saha, Tulika and Ananiadou, Sophia},
  title     = {Emotion-aware and Intent-controlled Empathetic Response Generation using Hierarchical Transformer Network},
  booktitle = {2022 International Joint Conference on Neural Networks (IJCNN)},
  year      = {2022},
  pages     = {1--8},
  doi       = {10.1109/IJCNN55064.2022.9892592}
}
Jain, R., Saha, T., Chakraborty, S., & Saha, S. (2022). Domain Infused Conversational Response Generation for Tutoring based Virtual Agent. 2022 International Joint Conference on Neural Networks (IJCNN), 1–8.
@inproceedings{9892890,
  author    = {Jain, Raghav and Saha, Tulika and Chakraborty, Souhitya and Saha, Sriparna},
  title     = {Domain Infused Conversational Response Generation for Tutoring based Virtual Agent},
  booktitle = {2022 International Joint Conference on Neural Networks (IJCNN)},
  year      = {2022},
  pages     = {1--8},
  doi       = {10.1109/IJCNN55064.2022.9892890}
}
Saha, T., Upadhyaya, A., Saha, S., & Bhattacharyya, P. (2021). Towards Sentiment and Emotion aided Multi-modal Speech Act Classification in Twitter. Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, 5727–5737. https://aclanthology.org/2021.naacl-main.456
@inproceedings{sahaetal2021towards,
  author    = {Saha, Tulika and Upadhyaya, Apoorva and Saha, Sriparna and Bhattacharyya, Pushpak},
  title     = {Towards Sentiment and Emotion aided Multi-modal Speech Act Classification in {T}witter},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  month     = jun,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {5727--5737},
  doi       = {10.18653/v1/2021.naacl-main.456},
  url       = {https://aclanthology.org/2021.naacl-main.456}
}
Speech Act Classification determining the communicative intent of an utterance has been investigated widely over the years as a standalone task. This holds true for discussion in any fora including social media platform such as Twitter. But the emotional state of the tweeter which has a considerable effect on the communication has not received the attention it deserves. Closely related to emotion is sentiment, and understanding of one helps understand the other. In this work, we firstly create a new multi-modal, emotion-TA (‘TA’ means tweet act, i.e., speech act in Twitter) dataset called EmoTA collected from open-source Twitter dataset. We propose a Dyadic Attention Mechanism (DAM) based multi-modal, adversarial multi-tasking framework. DAM incorporates intra-modal and inter-modal attention to fuse multiple modalities and learns generalized features across all the tasks. Experimental results indicate that the proposed framework boosts the performance of the primary task, i.e., TA classification (TAC) by benefitting from the two secondary tasks, i.e., Sentiment and Emotion Analysis compared to its uni-modal and single task TAC (tweet act classification) variants.
Tiwari, A., Saha, T., Saha, S., Sengupta, S., Maitra, A., Ramnani, R., & Bhattacharyya, P. (2021). Multi-Modal Dialogue Policy Learning for Dynamic and Co-operative Goal Setting. 2021 International Joint Conference on Neural Networks (IJCNN), 1–8.
@inproceedings{9533878,
  author    = {Tiwari, Abhisek and Saha, Tulika and Saha, Sriparna and Sengupta, Shubhashis and Maitra, Anutosh and Ramnani, Roshni and Bhattacharyya, Pushpak},
  title     = {Multi-Modal Dialogue Policy Learning for Dynamic and Co-operative Goal Setting},
  booktitle = {2021 International Joint Conference on Neural Networks (IJCNN)},
  year      = {2021},
  pages     = {1--8},
  doi       = {10.1109/IJCNN52387.2021.9533878}
}
Saha, T., Chopra, S., Saha, S., Bhattacharyya, P., & Kumar, P. (2021). A Large-Scale Dataset for Motivational Dialogue System: An Application of Natural Language Generation to Mental Health. 2021 International Joint Conference on Neural Networks (IJCNN), 1–8.
@inproceedings{9533924,
  author    = {Saha, Tulika and Chopra, Saraansh and Saha, Sriparna and Bhattacharyya, Pushpak and Kumar, Pankaj},
  title     = {A Large-Scale Dataset for Motivational Dialogue System: An Application of Natural Language Generation to Mental Health},
  booktitle = {2021 International Joint Conference on Neural Networks (IJCNN)},
  year      = {2021},
  pages     = {1--8},
  doi       = {10.1109/IJCNN52387.2021.9533924}
}
Saha, T., Priya, N., Saha, S., & Bhattacharyya, P. (2021). A Transformer based Multi-task Model for Domain Classification, Intent Detection and Slot-Filling. 2021 International Joint Conference on Neural Networks (IJCNN), 1–8.
@inproceedings{ijcnn21nlu,
  author    = {Saha, Tulika and Priya, Neeti and Saha, Sriparna and Bhattacharyya, Pushpak},
  title     = {A Transformer based Multi-task Model for Domain Classification, Intent Detection and Slot-Filling},
  booktitle = {2021 International Joint Conference on Neural Networks (IJCNN)},
  year      = {2021},
  pages     = {1--8},
  doi       = {10.1109/IJCNN52387.2021.9533525}
}
Saha, T., Patra, A., Saha, S., & Bhattacharyya, P. (2020). Towards Emotion-aided Multi-modal Dialogue Act Classification. Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, 4361–4372. https://aclanthology.org/2020.acl-main.402
@inproceedings{sahaetal2020towards,
  author    = {Saha, Tulika and Patra, Aditya and Saha, Sriparna and Bhattacharyya, Pushpak},
  title     = {Towards Emotion-aided Multi-modal Dialogue Act Classification},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  month     = jul,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {4361--4372},
  doi       = {10.18653/v1/2020.acl-main.402},
  url       = {https://aclanthology.org/2020.acl-main.402}
}
The task of Dialogue Act Classification (DAC) that purports to capture communicative intent has been studied extensively. But these studies limit themselves to text. Non-verbal features (change of tone, facial expressions etc.) can provide cues to identify DAs, thus stressing the benefit of incorporating multi-modal inputs in the task. Also, the emotional state of the speaker has a substantial effect on the choice of the dialogue act, since conversations are often influenced by emotions. Hence, the effect of emotion too on automatic identification of DAs needs to be studied. In this work, we address the role of both multi-modality and emotion recognition (ER) in DAC. DAC and ER help each other by way of multi-task learning. One of the major contributions of this work is a new dataset- multimodal Emotion aware Dialogue Act dataset called EMOTyDA, collected from open-sourced dialogue datasets. To demonstrate the utility of EMOTyDA, we build an attention based (self, inter-modal, inter-task) multi-modal, multi-task Deep Neural Network (DNN) for joint learning of DAs and emotions. We show empirically that multi-modality and multi-tasking achieve better performance of DAC compared to uni-modal and single task DAC variants.
Saha, T., Chopra, S., Saha, S., & Bhattacharyya, P. (2020). Reinforcement Learning Based Personalized Neural Dialogue Generation. In H. Yang, K. Pasupa, A. C.-S. Leung, J. T. Kwok, J. H. Chan, & I. King (Eds.), Neural Information Processing (pp. 709–716). Springer International Publishing.
@inproceedings{101007978303063820781,
  title     = {Reinforcement Learning Based Personalized Neural Dialogue Generation},
  author    = {Saha, Tulika and Chopra, Saraansh and Saha, Sriparna and Bhattacharyya, Pushpak},
  editor    = {Yang, Haiqin and Pasupa, Kitsuchart and Leung, Andrew Chi-Sing and Kwok, James T. and Chan, Jonathan H. and King, Irwin},
  booktitle = {Neural Information Processing},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2020},
  pages     = {709--716},
  isbn      = {978-3-030-63820-7}
}
In this paper, we present a persona aware neural reinforcement learning response generation framework capable of optimizing long-term rewards carefully devised by system developers. The proposed model utilizes an extension of the recently introduced Hierarchical Encoder Decoder (HRED) architecture. We leverage insights from Reinforcement Learning (RL) and employ policy gradient methods to optimize rewards which are defined as simple heuristic approximations that indicate good conversation to a human mind. The proposed model is demonstrated on two benchmark datasets. Empirical results indicate that the proposed approach outperforms their counterparts that do not optimize long-term rewards, have no access to personas, standard models trained using solely maximum-likelihood estimation objective.
Saha, T., Patra, A., Saha, S., & Bhattacharyya, P. (2020). A Transformer based Approach for Identification of Tweet Acts. 2020 International Joint Conference on Neural Networks (IJCNN), 1–8. https://ieeexplore.ieee.org/abstract/document/9207484
@inproceedings{9207484,
author = {Saha, Tulika and Patra, Aditya and Saha, Sriparna and Bhattacharyya, Pushpak},
booktitle = {2020 International Joint Conference on Neural Networks (IJCNN)},
title = {A Transformer based Approach for Identification of Tweet Acts},
year = {2020},
pages = {1--8},
doi = {10.1109/IJCNN48605.2020.9207484},
url = {https://ieeexplore.ieee.org/abstract/document/9207484}
}
Saha, T., Saha, S., & Bhattacharyya, P. (2020). Transfer Learning based Task-oriented Dialogue Policy for Multiple Domains using Hierarchical Reinforcement Learning. 2020 International Joint Conference on Neural Networks (IJCNN), 1–8.
@inproceedings{9206954,
author = {Saha, Tulika and Saha, Sriparna and Bhattacharyya, Pushpak},
booktitle = {2020 International Joint Conference on Neural Networks (IJCNN)},
title = {Transfer Learning based Task-oriented Dialogue Policy for Multiple Domains using Hierarchical Reinforcement Learning},
year = {2020},
pages = {1--8},
doi = {10.1109/IJCNN48605.2020.9206954}
}
Saha, T., Srivastava, S., Firdaus, M., Saha, S., Ekbal, A., & Bhattacharyya, P. (2019). Exploring Machine Learning and Deep Learning Frameworks for Task-Oriented Dialogue Act Classification. 2019 International Joint Conference on Neural Networks (IJCNN), 1–8. https://ieeexplore.ieee.org/document/8851943
@inproceedings{8851943,
author = {Saha, Tulika and Srivastava, S. and Firdaus, M. and Saha, Sriparna and Ekbal, A. and Bhattacharyya, Pushpak},
booktitle = {2019 International Joint Conference on Neural Networks (IJCNN)},
title = {Exploring Machine Learning and Deep Learning Frameworks for Task-Oriented Dialogue Act Classification},
year = {2019},
pages = {1--8},
doi = {10.1109/IJCNN.2019.8851943},
url = {https://ieeexplore.ieee.org/document/8851943}
}
Saha, T., Saha, S., & Bhattacharyya, P. (2019). Tweet Act Classification: A Deep Learning based Classifier for Recognizing Speech Acts in Twitter. 2019 International Joint Conference on Neural Networks (IJCNN), 1–8. https://ieeexplore.ieee.org/document/8851805
@inproceedings{8851805,
author = {Saha, Tulika and Saha, Sriparna and Bhattacharyya, Pushpak},
booktitle = {2019 International Joint Conference on Neural Networks (IJCNN)},
title = {Tweet Act Classification: A Deep Learning based Classifier for Recognizing Speech Acts in Twitter},
year = {2019},
pages = {1--8},
doi = {10.1109/IJCNN.2019.8851805},
url = {https://ieeexplore.ieee.org/document/8851805}
}
Saha, T., Saha, S., & Bhattacharyya, P. (2018). Exploring Deep Learning Architectures Coupled with CRF Based Prediction for Slot-Filling. In L. Cheng, A. C. S. Leung, & S. Ozawa (Eds.), Neural Information Processing (pp. 214–225). Springer International Publishing.
@inproceedings{11007978303004167020,
  title     = {Exploring Deep Learning Architectures Coupled with CRF Based Prediction for Slot-Filling},
  author    = {Saha, Tulika and Saha, Sriparna and Bhattacharyya, Pushpak},
  editor    = {Cheng, Long and Leung, Andrew Chi Sing and Ozawa, Seiichi},
  booktitle = {Neural Information Processing},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2018},
  pages     = {214--225},
  isbn      = {978-3-030-04167-0}
}
Slot-filling is one of the most crucial module of any dialogue system that focuses on extracting relevant and necessary information from the user utterances. In this paper, we propose variants of Long Short-Term Memory (LSTM) and Gated Recurrent Unit (GRU) models for the task of slot-filling which includes LSTM/GRU networks, Bi-directional LSTM/GRU (Bi-LSTM/GRU) networks, LSTM/GRU-CRF and Bi-LSTM/GRU-CRF networks. Variants of LSTM/GRU is used for discourse modeling i.e., to capture long term dependencies in the input sentences. A Conditional Random Field (CRF) layer is integrated with the above network to capture the sentence level tag information. We show the experimental results of our proposed model on the benchmark Air Travel Information System (ATIS) dataset which indicate that our model performed exceptionally well compared to the state of the art.
Saha, T., Gupta, D., Saha, S., & Bhattacharyya, P. (2018). Reinforcement Learning Based Dialogue Management Strategy. In L. Cheng, A. C. S. Leung, & S. Ozawa (Eds.), Neural Information Processing (pp. 359–372). Springer International Publishing.
@inproceedings{101007978303004182332,
  title     = {Reinforcement Learning Based Dialogue Management Strategy},
  author    = {Saha, Tulika and Gupta, Dhawal and Saha, Sriparna and Bhattacharyya, Pushpak},
  editor    = {Cheng, Long and Leung, Andrew Chi Sing and Ozawa, Seiichi},
  booktitle = {Neural Information Processing},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2018},
  pages     = {359--372},
  isbn      = {978-3-030-04182-3}
}
This paper proposes a novel Markov Decision Process (MDP) to solve the problem of learning an optimal strategy by a Dialogue Manager for a flight enquiry system. A unique representation of state is presented followed by a relevant action set and a reward model which is specific to different time-steps. Different Reinforcement Learning (RL) algorithms based on classical methods and Deep Learning techniques have been implemented for the execution of the Dialogue Management component. To establish the robustness of the system, an existing Slot-Filling (SF) module has been integrated with the system. The system can still generate valid responses to act sensibly even if the SF module falters. The experimental results indicate that the proposed MDP and the system hold promise to be scalable while satisfying the intent of the user.
Refereed Journal Articles
Saha, T., Reddy, S. M., Saha, S., & Bhattacharyya, P. (2023). Mental Health Disorder Identification From Motivational Conversations. IEEE Transactions on Computational Social Systems, 10(3), 1130–1139.
@article{9729467,
author = {Saha, Tulika and Reddy, Saichethan Miriyala and Saha, Sriparna and Bhattacharyya, Pushpak},
journal = {IEEE Transactions on Computational Social Systems},
title = {Mental Health Disorder Identification From Motivational Conversations},
year = {2023},
volume = {10},
number = {3},
pages = {1130--1139},
doi = {10.1109/TCSS.2022.3143763}
}
Tiwari, A., Saha, T., Saha, S., Bhattacharyya, P., Begum, S., Dhar, M., & Tiwari, S. (2022). Symptoms are known by their companies: towards association guided disease diagnosis assistant. BMC Bioinformatics, 23(1), 556. https://doi.org/10.1186/s12859-022-05032-y
@article{Tiwari2022,
author = {Tiwari, Abhisek and Saha, Tulika and Saha, Sriparna and Bhattacharyya, Pushpak and Begum, Shemim and Dhar, Minakshi and Tiwari, Sarbajeet},
title = {Symptoms are known by their companies: towards association guided disease diagnosis assistant},
journal = {BMC Bioinformatics},
year = {2022},
month = dec,
day = {22},
volume = {23},
number = {1},
pages = {556},
issn = {1471-2105},
doi = {10.1186/s12859-022-05032-y}
}
Over the last few years, dozens of healthcare surveys have shown a shortage of doctors and an alarming doctor-population ratio. With the motivation of assisting doctors and utilizing their time efficiently, automatic disease diagnosis using artificial intelligence is experiencing an ever-growing demand and popularity. Humans are known by the company they keep; similarly, symptoms also exhibit the association property, i.e., one symptom may strongly suggest another symptom’s existence/non-existence, and their association provides crucial information about the suffering condition. The work investigates the role of symptom association in symptom investigation and disease diagnosis process. We propose and build a virtual assistant called Association guided Symptom Investigation and Diagnosis Assistant (A-SIDA) using hierarchical reinforcement learning. The proposed A-SIDA converses with patients and extracts signs and symptoms as per patients’ chief complaints and ongoing dialogue context. We infused association-based recommendations and critic into the assistant, which reinforces the assistant for conducting context-aware, symptom-association guided symptom investigation. Following the symptom investigation, the assistant diagnoses a disease based on the extracted signs and symptoms. In addition to diagnosis accuracy, the relevance of inspected symptoms is critical to the usefulness of a diagnosis framework. We also propose a novel evaluation metric called Investigation Relevance Score (IReS), which measures the relevance of symptoms inspected during symptom investigation. The obtained improvements (Diagnosis success rate-5.36%, Dialogue length-1.16, Match rate-2.19%, Disease classifier-6.36%, IReS-0.3501, and Human score-0.66) over state-of-the-art methods firmly establish the crucial role of symptom association that gets uncovered by the virtual agent. 
Furthermore, we found that the association guided symptom investigation greatly increases human satisfaction, owing to its seamless topic (symptom) transition.
Saha, T., Saha, S., & Bhattacharyya, P. (2022). Towards Sentiment-Aware Multi-Modal Dialogue Policy Learning. Cognitive Computation, 14(1), 246–260. https://doi.org/10.1007/s12559-020-09769-7
@article{Saha2022,
author = {Saha, Tulika and Saha, Sriparna and Bhattacharyya, Pushpak},
title = {Towards Sentiment-Aware Multi-Modal Dialogue Policy Learning},
journal = {Cognitive Computation},
year = {2022},
month = jan,
day = {01},
volume = {14},
number = {1},
pages = {246--260},
issn = {1866-9964},
doi = {10.1007/s12559-020-09769-7}
}
Creation of task-oriented dialog/virtual agent (VA) capable of managing complex domain-specific user queries pertaining to multiple intents is difficult since the agent must deal with several subtasks simultaneously. Most end-to-end dialogue systems, however, only provide user semantics as inputs from texts into the learning process and neglect other useful user behaviour and information from other modalities such as images. This stresses the benefit of incorporating multi-modal inputs for eliciting user preference in the task. Also, sentiment of the user plays a significant role in achieving maximum user/customer satisfaction during the conversation. Thus, it is also important to incorporate users’ sentiments during policy learning, especially when serving user’s composite goals. For the creation of multi-modal VA aided with sentiment for conversations encompassing multi-intents, this paper introduces a new dataset, named Vis-SentiVA: Visual and Sentiment aided VA created from open-accessed conversational dataset. We present a hierarchical reinforcement learning (HRL) typically options-based VA to learn policies for serving multi-intent dialogues. Multi-modal information (texts and images) extraction to identify user’s preference is incorporated in the learning framework. A combination of task-based and sentiment-based rewards is integrated in the hierarchical value functions for the VA to be user adaptive. Empirically, we show that all these aspects induced together in the learning framework play a vital role in acquiring higher dialogue task success and increased user contentment in the process of creating composite-natured VAs. This is the first effort in integrating sentiment-aware rewards in the multi-modal HRL framework. The paper highlights that it is indeed essential to include other modes of information extraction such as images and behavioural cues of the user such as sentiment to secure greater user contentment. 
This also helps in improving success of composite-natured VAs serving task-oriented dialogues.
Saha, T., Upadhyaya, A., Saha, S., & Bhattacharyya, P. (2022). A Multitask Multimodal Ensemble Model for Sentiment- and Emotion-Aided Tweet Act Classification. IEEE Transactions on Computational Social Systems, 9(2), 508–517.
@article{9469005,
author = {Saha, Tulika and Upadhyaya, Apoorva and Saha, Sriparna and Bhattacharyya, Pushpak},
journal = {IEEE Transactions on Computational Social Systems},
title = {A Multitask Multimodal Ensemble Model for Sentiment- and Emotion-Aided Tweet Act Classification},
year = {2022},
volume = {9},
number = {2},
pages = {508--517},
doi = {10.1109/TCSS.2021.3088714}
}
Tiwari, A., Saha, T., Saha, S., Sengupta, S., Maitra, A., Ramnani, R., & Bhattacharyya, P. (2022). A persona aware persuasive dialogue policy for dynamic and co-operative goal setting. Expert Systems with Applications, 195, 116303. https://www.sciencedirect.com/science/article/pii/S0957417421016067
@article{TIWARI2022116303,
title = {A persona aware persuasive dialogue policy for dynamic and co-operative goal setting},
author = {Tiwari, Abhisek and Saha, Tulika and Saha, Sriparna and Sengupta, Shubhashis and Maitra, Anutosh and Ramnani, Roshni and Bhattacharyya, Pushpak},
journal = {Expert Systems with Applications},
volume = {195},
pages = {116303},
year = {2022},
issn = {0957-4174},
doi = {10.1016/j.eswa.2021.116303},
url = {https://www.sciencedirect.com/science/article/pii/S0957417421016067},
keywords = {Conversational agents, Dynamic goals, Goal unavailability, E-commerce, Reinforcement learning, Persuasion}
}
Contextualization:
In recent years, the popularity of virtual agents particularly task-oriented dialogue agents has increased immensely due to their effectiveness and simplicity in various domains such as industry, e-commerce, and health.
Problem:
In the real world, users do not always have a predefined and immutable goal, i.e., they may upgrade/downgrade/update their task goal dynamically depending upon their utility and the serving capability of the assisting agent. However, the existing Virtual Agents (VAs) in the dialogue literature relinquish and yield dialogue failure if they find any dynamic goal setting or goal unavailability scenarios.
Contributions and methodology:
Motivated by these inabilities of existing VAs, we propose some intelligent and expert Dialogue Agents (A Unified Dialogue Agent and Multi-agent Dialogue system) that can deal with dynamic and goal unavailability situations to elevate both user satisfaction and the agent’s utility particularly task success rate. The proposed architecture incorporates a goal guiding module namely Dynamic and Co-Operative Goal Driven Module (DyCoGDM), which traces goal status and resolves goal discrepancy through dynamic goal setting (Goal Formulator) and personalized persuasion (Goal Persuader) mechanisms. We also created and annotated a dialogue corpus because of unavailability of such corpus featured with dynamic and goal unavailability scenarios.
Findings and implications:
Our proposed method outperforms several baselines and state of the art methods in all evaluation metrics. The proposed VA is capable of dealing with dynamic goals and goal unavailability scenarios effectively. The study found that the persona aware persuasive dialogue agent outperforms generalized persuasive dialogue agent by a large margin. Furthermore, we also observed that the task oriented reward is the most essential reward for training a reinforcement learning based agent and agents trained without task based reward do not even converge.
Saha, T., Gupta, D., Saha, S., & Bhattacharyya, P. (2021). A hierarchical approach for efficient multi-intent dialogue policy learning. Multimedia Tools and Applications, 80(28), 35025–35050. https://doi.org/10.1007/s11042-020-09070-7
@article{Saha2021,
author = {Saha, Tulika and Gupta, Dhawal and Saha, Sriparna and Bhattacharyya, Pushpak},
title = {A hierarchical approach for efficient multi-intent dialogue policy learning},
journal = {Multimedia Tools and Applications},
year = {2021},
month = nov,
day = {01},
volume = {80},
number = {28},
pages = {35025--35050},
issn = {1573-7721},
doi = {10.1007/s11042-020-09070-7}
}
This paper proposes a hierarchical method for learning an efficient Dialogue Management (DM) strategy for task-oriented conversations serving multiple intents of a domain. Deep Reinforcement Learning (DRL) networks specializing in individual intents communicate with each other, having the capability of sharing overlapping information across intents. The sharing of information across state space and the presence of global slot tracker prohibits the agent to reask known information. Thus, the system is able to handle sub-dialogues based on subset of intents covered by different Reinforcement Learning (RL) models, thereby, completing the dialogue without again asking already provided information common across intents. The developed system has been demonstrated for “Air Travel” domain. The experimental results indicate that the developed system is efficient, scalable and can serve multiple intents based dialogues adequately. The proposed system when applied to 5-intent dialogue systems attains an improvement of 41% in terms of dialogue length as compared to a single-intent based system serving the same 5-intents.
Saha, T., Gupta, D., Saha, S., & Bhattacharyya, P. (2021). A Unified Dialogue Management Strategy for Multi-Intent Dialogue Conversations in Multiple Languages. ACM Trans. Asian Low-Resour. Lang. Inf. Process., 20(6). https://doi.org/10.1145/3461763
@article{1011453461763,
author = {Saha, Tulika and Gupta, Dhawal and Saha, Sriparna and Bhattacharyya, Pushpak},
title = {A Unified Dialogue Management Strategy for Multi-Intent Dialogue Conversations in Multiple Languages},
journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.},
year = {2021},
month = sep,
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {20},
number = {6},
articleno = {99},
numpages = {22},
issn = {2375-4699},
doi = {10.1145/3461763},
keywords = {options, hierarchical reinforcement learning, multi-intent, hierarchies, Dialogue management}
}
Building Virtual Agents capable of carrying out complex queries of the user involving multiple intents of a domain is quite a challenge, because it demands that the agent manages several subtasks simultaneously. This article presents a universal Deep Reinforcement Learning framework that can synthesize dialogue managers capable of working in a task-oriented dialogue system encompassing various intents pertaining to a domain. The conversation between agent and user is broken down into hierarchies, to segregate subtasks pertinent to different intents. The concept of Hierarchical Reinforcement Learning, particularly options, is used to learn policies in different hierarchies that operates in distinct time steps to fulfill the user query successfully. The dialogue manager comprises top-level intent meta-policy to select among subtasks or options and a low-level controller policy to pick primitive actions to communicate with the user to complete the subtask provided to it by the top-level policy in varying intents of a domain. The proposed dialogue management module has been trained in a way such that it can be reused for any language for which it has been developed with little to no supervision. The developed system has been demonstrated for “Air Travel” and “Restaurant” domain in English and Hindi languages. Empirical results determine the robustness and efficacy of the learned dialogue policy as it outperforms several baselines and a state-of-the-art system.
Tiwari, A., Saha, T., Saha, S., Sengupta, S., Maitra, A., Ramnani, R., & Bhattacharyya, P. (2021). A dynamic goal adapted task oriented dialogue agent. PLOS ONE, 16(4), 1–32. https://doi.org/10.1371/journal.pone.0249030
@article{10.1371journalpone0249030,
author = {Tiwari, Abhisek and Saha, Tulika and Saha, Sriparna and Sengupta, Shubhashis and Maitra, Anutosh and Ramnani, Roshni and Bhattacharyya, Pushpak},
title = {A dynamic goal adapted task oriented dialogue agent},
journal = {PLOS ONE},
publisher = {Public Library of Science},
year = {2021},
month = apr,
volume = {16},
number = {4},
pages = {1--32},
doi = {10.1371/journal.pone.0249030}
}
Purpose Existing virtual agents (VAs) present in dialogue systems are either information retrieval based or static goal-driven. However, in real-world situations, end-users might not have a known and fixed goal beforehand for the task, i.e., they may upgrade/downgrade/update their goal components in real-time to maximize their utility values. Existing VAs are unable to handle such dynamic goal-oriented situations. Methodology Due to the absence of any related dialogue dataset where such choice deviations are present, we have created a conversational dataset called Deviation adapted Virtual Agent (DevVA), with the manual annotation of its corresponding intents, slots, and sentiment labels. A Dynamic Goal Driven Dialogue Agent (DGDVA) has been developed by incorporating a Dynamic Goal Driven Module (GDM) on top of a deep reinforcement learning based dialogue manager. In the course of a conversation, the user sentiment provides grounded feedback about agent behavior, including goal serving action. User sentiment appears to be an appropriate indicator for goal discrepancy that guides the agent to complete the user’s desired task with gratification. The negative sentiment expressed by the user about an aspect of the provided choice is treated as a discrepancy that is being resolved by the GDM depending upon the observed discrepancy and current dialogue state. The goal update capability and the VA’s interactiveness trait enable end-users to accomplish their desired task satisfactorily. Findings The obtained experimental results illustrate that DGDVA can handle dynamic goals with maximum user satisfaction and a significantly higher success rate. The interaction drives the user to decide its final goal through the latent specification of possible choices and information retrieved and provided by the dialogue agent. 
Through the experimental results (qualitative and quantitative), we firmly conclude that the proposed sentiment-aware VA adapts users’ dynamic behavior for its goal setting with substantial efficacy in terms of primary objective i.e., task success rate (0.88). Practical implications In real world, it can be argued that many people do not have a predefined and fixed goal for tasks such as online shopping, movie booking & restaurant booking, etc. They tend to explore the available options first which are aligned with their minimum requirements and then decide one amongst them. The DGDVA provides maximum user satisfaction as it enables them to accomplish a dynamic goal that leads to additional utilities along with the essential ones. Originality To the best of our knowledge, this is the first effort towards the development of A Dynamic Goal Adapted Task-Oriented Dialogue Agent that can serve user goals dynamically until the user is satisfied.
Saha, T., Gupta, D., Saha, S., & Bhattacharyya, P. (2021). Emotion Aided Dialogue Act Classification for Task-Independent Conversations in a Multi-modal Framework. Cognitive Computation, 13(2), 277–289. https://doi.org/10.1007/s12559-019-09704-5
@article{Saha2023,
author = {Saha, Tulika and Gupta, Dhawal and Saha, Sriparna and Bhattacharyya, Pushpak},
title = {Emotion Aided Dialogue Act Classification for Task-Independent Conversations in a Multi-modal Framework},
journal = {Cognitive Computation},
year = {2021},
month = mar,
day = {01},
volume = {13},
number = {2},
pages = {277--289},
issn = {1866-9964},
doi = {10.1007/s12559-019-09704-5}
}
Dialogue act classification (DAC) gives a significant insight into understanding the communicative intention of the user. Numerous machine learning (ML) and deep learning (DL) approaches have been proposed over the years in these regards for task-oriented/independent conversations in the form of texts. However, the affect of emotional state in determining the dialogue acts (DAs) has not been studied in depth in a multi-modal framework involving text, audio, and visual features. Conversations are intrinsically determined and regulated by direct, exquisite, and subtle emotions. The emotional state of a speaker has a considerable affect on its intentional or its pragmatic content. This paper thoroughly investigates the role of emotions in automatic identification of the DAs in task-independent conversations in a multi-modal framework (specifically audio and texts). A DL-based multi-tasking network for DAC and emotion recognition (ER) has been developed incorporating attention to facilitate the fusion of different modalities. An open source, benchmarked ER multi-modal dataset IEMOCAP has been manually annotated for its corresponding DAs to make it suitable for multi-task learning and further advance the research in multi-modal DAC. The proposed multi-task framework attains an improvement of 2.5% against its single-task DAC counterpart for manually annotated IEMOCAP dataset. Results as compared with several baselines establish the efficacy of the proposed approach and the importance of incorporating emotion while identifying the DAs.
Saha, T., Saha, S., & Bhattacharyya, P. (2020). Towards sentiment aided dialogue policy learning for multi-intent conversations using hierarchical reinforcement learning. PLOS ONE, 15(7), 1–28. https://doi.org/10.1371/journal.pone.0235367
@article{101371journalpone0235367,
author = {Saha, Tulika and Saha, Sriparna and Bhattacharyya, Pushpak},
title = {Towards sentiment aided dialogue policy learning for multi-intent conversations using hierarchical reinforcement learning},
journal = {PLOS ONE},
publisher = {Public Library of Science},
year = {2020},
month = jul,
volume = {15},
number = {7},
pages = {1--28},
doi = {10.1371/journal.pone.0235367}
}
Purpose Developing a Dialogue/Virtual Agent (VA) that can handle complex tasks (need) of the user pertaining to multiple intents of a domain is challenging as it requires the agent to simultaneously deal with multiple subtasks. However, majority of these end-to-end dialogue systems incorporate only user semantics as inputs in the learning process and ignore other useful user behavior and information. Sentiment of the user at the time of conversation plays an important role in securing maximum user gratification. So, incorporating sentiment of the user during the policy learning becomes even more crucial, more so when serving composite tasks of the user. Methodology As a first step towards enabling the development of sentiment aided VA for multi-intent conversations, this paper proposes a new dataset, annotated with its corresponding intents, slot and sentiment (considering the entire dialogue history) labels, named SentiVA, collected from open-sourced dialogue datasets. In order to integrate these multiple aspects, a Hierarchical Reinforcement Learning (HRL) specifically options based VA is proposed to learn strategies for managing multi-intent conversations. Along with task success based immediate rewards, sentiment based immediate rewards are also incorporated in the hierarchical value functions to make the VA user adaptive. Findings Empirically, the paper shows that task based and sentiment based immediate rewards cumulatively are required to ensure successful task completion and attain maximum user satisfaction in a multi-intent scenario instead of any of these rewards alone. Practical implications The eventual evaluators and consumers of dialogue systems are users. Thus, to ensure a fulfilling conversational experience involving maximum user satisfaction requires VA to consider user sentiment at every time-step in its decision making policy. Originality This work is the first attempt in incorporating sentiment based rewards in the HRL framework.
Saha, T., Gupta, D., Saha, S., & Bhattacharyya, P. (2020). Towards integrated dialogue policy learning for multiple domains and intents using Hierarchical Deep Reinforcement Learning. Expert Systems with Applications, 162, 113650. https://www.sciencedirect.com/science/article/pii/S0957417420304747
@article{SAHA2020113650,
  author   = {Saha, Tulika and Gupta, Dhawal and Saha, Sriparna and Bhattacharyya, Pushpak},
  title    = {Towards integrated dialogue policy learning for multiple domains and intents using Hierarchical Deep Reinforcement Learning},
  journal  = {Expert Systems with Applications},
  year     = {2020},
  volume   = {162},
  pages    = {113650},
  issn     = {0957-4174},
  doi      = {10.1016/j.eswa.2020.113650},
  url      = {https://www.sciencedirect.com/science/article/pii/S0957417420304747},
  keywords = {Dialogue management, Multi-domain, Multi-intent, Hierarchical Reinforcement Learning, Options}
}
Creation of Expert and Intelligent Dialogue/Virtual Agent (VA) that can serve complicated and intricate tasks (need) of the user related to multiple domains and its various intents is indeed quite challenging as it necessitates the agent to concurrently handle multiple subtasks in different domains. This paper presents an expert, unified and a generic Deep Reinforcement Learning (DRL) framework that creates dialogue managers competent for managing task-oriented conversations embodying multiple domains along with their various intents and provide the user with an expert system which is a one stop for all queries. In order to address these multiple aspects, the dialogue exchange between the user and the VA is split into hierarchies, so as to isolate and identify subtasks belonging to different domains. The notion of Hierarchical Reinforcement Learning (HRL) specifically options is employed to learn optimal policies in these hierarchies that operate at varying time steps to accomplish the user goal. The dialogue manager encompasses a top-level domain meta-policy, intermediate-level intent meta-policies in order to select amongst varied and multiple subtasks or options and low-level controller policies to select primitive actions to complete the subtask given by the higher-level meta-policies in varying intents and domains. Sharing of controller policies among overlapping subtasks enables the meta-policies to be generic. The proposed expert framework has been demonstrated in the domains of “Air Travel” and “Restaurant”. Experiments as compared to several strong baselines and a state of the art model establish the efficiency of the learned policies and the need for such expert models capable of handling complex and composite tasks.
Saha, T., Ramesh Jayashree, S., Saha, S., & Bhattacharyya, P. (2020). BERT-Caps: A Transformer-Based Capsule Network for Tweet Act Classification. IEEE Transactions on Computational Social Systems, 7(5), 1168–1179.
@article{9199096,
author = {Saha, Tulika and Ramesh Jayashree, Srivatsa and Saha, Sriparna and Bhattacharyya, Pushpak},
journal = {IEEE Transactions on Computational Social Systems},
title = {BERT-Caps: A Transformer-Based Capsule Network for Tweet Act Classification},
year = {2020},
volume = {7},
number = {5},
pages = {1168--1179},
doi = {10.1109/TCSS.2020.3014128}
}